# Cleaning "title_basics.tsv" into "movie_titles"

## Importing packages

In [None]:
import pandas as pd
import matplotlib
import numpy
import pathlib as path

## Importing Files

In [2]:
rel_path = path.Path().absolute()
#Importing the dataframe by every 1 million tuples because the full dataset is too large for one dataframe
first_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t', nrows = 1000000)
sec_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 1000000), nrows = 1000000)
thir_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 2000000), nrows = 1000000)
four_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 3000000), nrows = 1000000)
five_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 4000000), nrows = 1000000)
six_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 5000000), nrows = 1000000)
fin_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 6000000), nrows = 1000000)
original_tuples = len(first_ten_thou) + len(sec_ten_thou) + len(thir_ten_thou) + len(four_ten_thou) + len(five_ten_thou) + len(six_ten_thou) + len(fin_ten_thou)
original_size = first_ten_thou.size + sec_ten_thou.size + thir_ten_thou.size + four_ten_thou.size + five_ten_thou.size + six_ten_thou.size + fin_ten_thou.size

  interactivity=interactivity, compiler=compiler, result=result)


## Cleaning Data

### Removing all non-movie titles

In [6]:
first_ten_thou = first_ten_thou[(first_ten_thou['titleType'].str.contains('movie', case = False)) | (first_ten_thou['titleType'].str.contains('hort', case = False))]
sec_ten_thou = sec_ten_thou[(sec_ten_thou['titleType'].str.contains('movie', case = False)) | (sec_ten_thou['titleType'].str.contains('hort', case = False))]
thir_ten_thou = thir_ten_thou[((thir_ten_thou['titleType'].str.contains('movie', case = False))  | (thir_ten_thou['titleType'].str.contains('hort', case = False)))]
four_ten_thou = four_ten_thou[(four_ten_thou['titleType'].str.contains('movie', case = False)) | (four_ten_thou['titleType'].str.contains('hort', case = False))]
five_ten_thou = five_ten_thou[(five_ten_thou['titleType'].str.contains('movie', case = False)) | (five_ten_thou['titleType'].str.contains('hort', case = False))]
six_ten_thou = six_ten_thou[(six_ten_thou['titleType'].str.contains('movie', case = False)) | (six_ten_thou['titleType'].str.contains('hort', case = False))]
fin_ten_thou = fin_ten_thou[(fin_ten_thou['titleType'].str.contains('movie', case = False)) | (fin_ten_thou['titleType'].str.contains('hort', case = False))]

### Remerging original dataset

In [7]:
movie_titles = pd.concat([
    first_ten_thou,
    sec_ten_thou,
    thir_ten_thou,
    four_ten_thou,
    five_ten_thou,
    six_ten_thou,
    fin_ten_thou
])

### Dropping endYear

In [8]:
del movie_titles['endYear']

### Replacing \N

In [9]:
movie_titles = movie_titles.copy()
movie_titles['startYear'] = pd.to_numeric(movie_titles['startYear'], errors = 'coerce')
movie_titles['runtimeMinutes'] = pd.to_numeric(movie_titles['runtimeMinutes'], errors = 'coerce')
movie_titles = movie_titles.fillna(0)

In [11]:
print(len(movie_titles[movie_titles['isAdult'] == 1]))
print(len(movie_titles[movie_titles['titleType'].str.contains('tv', case = False)]))
# movie_titles

9998
132704


# Exporting Data

In [14]:
movie_titles.to_csv(str(rel_path) + r'\..\src\movie_titles.csv')
new_tuples = len(movie_titles)
new_size = movie_titles.size

# Metrics

In [13]:
print("Original Tuple Count:\t"+str(original_tuples))
print("Cleansed Tuple Count:\t"+ str(new_tuples))
percent_tuple_reduction = (100 - round((100 * new_tuples) / original_tuples, 2))
print("Tuples Reduced By: \t"+ str(percent_tuple_reduction)+ "%")
percent_size_reduction = (100 - round((100 * new_size) / original_size, 2))
print('Size Reduced By: \t' + str(percent_size_reduction) + "%")

movies_only = movie_titles[movie_titles['titleType'] == 'movie']
del movies_only['titleType']

no_porn = movie_titles[movie_titles['isAdult'] == 0]
del no_porn['isAdult']

both = movies_only[movies_only['isAdult'] == 0]
del both['isAdult']
print('Size Reduction: \t' + str(100 - round((100 * movies_only.size) / original_size, 2))+"%")
print('Size Reduction: \t' + str(100 - round((100 * no_porn.size) / original_size, 2))+"%")
print('Size Reduction: \t' + str(100 - round((100 * both.size) / original_size, 2))+"%")


Original Tuple Count:	6561312
Cleansed Tuple Count:	1401809
Tuples Reduced By: 	78.64%
Size Reduced By: 	81.01%
Size Reduction: 	93.55%
Size Reduction: 	83.5%
Size Reduction: 	94.56%
