# Cleaning "title_basics.tsv" into "movie_titles"

## Importing packages

In [1]:
import pandas as pd
import matplotlib
import numpy
import pathlib as path

## Importing Files

In [2]:
rel_path = path.Path().absolute()
#Importing the dataframe by every 1 million tuples because the full dataset is too large for one dataframe
try:
    first_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t', nrows = 1000000)
    sec_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 1000000), nrows = 1000000)
    thir_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 2000000), nrows = 1000000)
    four_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 3000000), nrows = 1000000)
    five_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 4000000), nrows = 1000000)
    six_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 5000000), nrows = 1000000)
    fin_ten_thou = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 6000000), nrows = 1000000)
    original_tuples = len(first_ten_thou) + len(sec_ten_thou) + len(thir_ten_thou) + len(four_ten_thou) + len(five_ten_thou) + len(six_ten_thou) + len(fin_ten_thou)
    original_size = first_ten_thou.size + sec_ten_thou.size + thir_ten_thou.size + four_ten_thou.size + five_ten_thou.size + six_ten_thou.size + fin_ten_thou.size
except:
    print("You're massing the title_basics.tsv file. Download it from the following link and unzip it to the raw_data folder. Then restart the kernel and rerun")
    print("https://datasets.imdbws.com/")

  interactivity=interactivity, compiler=compiler, result=result)


## Cleaning Data

### Removing all non-movie titles

In [26]:
first_ten_thou = first_ten_thou[(first_ten_thou['titleType']== 'movie')]
sec_ten_thou = sec_ten_thou[(sec_ten_thou['titleType']== 'movie')]
thir_ten_thou = thir_ten_thou[(thir_ten_thou['titleType']== 'movie')]
four_ten_thou = four_ten_thou[(four_ten_thou['titleType']== 'movie')]
five_ten_thou = five_ten_thou[(five_ten_thou['titleType']== 'movie')]
six_ten_thou = six_ten_thou[(six_ten_thou['titleType']== 'movie')]
fin_ten_thou = fin_ten_thou[(fin_ten_thou['titleType']== 'movie')]

### Remerging original dataset

In [27]:
movie_titles = pd.concat([
    first_ten_thou,
    sec_ten_thou,
    thir_ten_thou,
    four_ten_thou,
    five_ten_thou,
    six_ten_thou,
    fin_ten_thou
])

### Dropping extraneous columns

In [28]:
del movie_titles['endYear']
del movie_titles['titleType']

### Replacing \N

In [29]:
movie_titles = movie_titles.copy()
movie_titles['startYear'] = pd.to_numeric(movie_titles['startYear'], errors = 'coerce')
movie_titles['runtimeMinutes'] = pd.to_numeric(movie_titles['runtimeMinutes'], errors = 'coerce')
movie_titles = movie_titles.fillna(0)

# Exporting Data

In [30]:
movie_titles.to_csv(str(rel_path) + r'\..\src\movie_titles.csv')
new_tuples = len(movie_titles)
new_size = movie_titles.size

# Metrics

In [34]:
print("Original Tuple Count:\t"+str(original_tuples))
print("Cleansed Tuple Count:\t"+ str(new_tuples))
percent_tuple_reduction = (100 - round((100 * new_tuples) / original_tuples, 2))
print("Tuples Reduced By: \t"+ str(percent_tuple_reduction)+ "%")
percent_size_reduction = (100 - round((100 * new_size) / original_size, 2))
print('Size Reduced By: \t' + str(percent_size_reduction) + "%")

Original Tuple Count:	6561312
Cleansed Tuple Count:	544052
Tuples Reduced By: 	91.71000000000001%
Size Reduced By: 	93.55%
Size Reduction: 	94.56%
