# Cleaning "title_basics.tsv" into "movie_titles"

## Importing packages

In [None]:
import pandas as pd
import pathlib as path

## Importing Files

In [24]:
rel_path = path.Path().absolute()
#Importing the dataframe by every 1 million tuples because the full dataset is too large for one dataframe
try:
    n_1 = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t', nrows = 1000000)
    n_2 = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 1000000), nrows = 1000000)
    n_3 = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 2000000), nrows = 1000000)
    n_4 = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 3000000), nrows = 1000000)
    n_5 = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 4000000), nrows = 1000000)
    n_6 = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 5000000), nrows = 1000000)
    n_7 = pd.read_csv(str(rel_path) + r"\raw_data\title_basics.tsv", sep='\t',skiprows= range(1, 6000000), nrows = 1000000)
    original_tuples = len(n_1) + len(n_2) + len(n_3) + len(n_4) + len(n_5) + len(n_6) + len(n_7)
    original_size = n_1.size + n_2.size + n_3.size + n_4.size + n_5.size + n_6.size + n_7.size
except:
    print("You're massing the title_basics.tsv file. Download it from the following link and unzip it to the raw_data folder. Then restart the kernel and rerun")
    print("https://datasets.imdbws.com/")

## Cleaning Data

### Removing all non-movie titles

In [25]:
n_1 = n_1[(n_1['titleType']== 'movie') | (n_1['titleType']== 'short')]
n_2 = n_2[(n_2['titleType']== 'movie') | (n_2['titleType']== 'short')]
n_3 = n_3[(n_3['titleType']== 'movie') | (n_3['titleType']== 'short')]
n_4 = n_4[(n_4['titleType']== 'movie') | (n_4['titleType']== 'short')]
n_5 = n_5[(n_5['titleType']== 'movie') | (n_5['titleType']== 'short')]
n_6 = n_6[(n_6['titleType']== 'movie') | (n_6['titleType']== 'short')]
n_7 = n_7[(n_7['titleType']== 'movie') | (n_7['titleType']== 'short')]

### Remerging original dataset

In [33]:
movie_titles = pd.concat([
    n_1,
    n_2,
    n_3,
    n_4,
    n_5,
    n_6,
    n_7
])

### Removing Adult Titles

In [34]:
movie_titles = movie_titles[movie_titles['isAdult'] == 0]

### Dropping extraneous columns

In [35]:
del movie_titles['endYear']
del movie_titles['isAdult']

### Replacing \N

In [36]:
movie_titles = movie_titles.copy()
movie_titles['startYear'] = pd.to_numeric(movie_titles['startYear'], errors = 'coerce')
movie_titles['runtimeMinutes'] = pd.to_numeric(movie_titles['runtimeMinutes'], errors = 'coerce')
movie_titles = movie_titles.fillna(0)

### Removing years before 1929

In [37]:
movie_titles = movie_titles[movie_titles['startYear'] >= 1929]

# Exporting Data

In [40]:
movie_titles.to_csv(str(rel_path) + r'\..\src\titles_during_oscars.csv')
new_tuples = len(movie_titles)
new_size = movie_titles.size

# Metrics

In [39]:
print("Original Tuple Count:\t"+str(original_tuples))
print("Cleansed Tuple Count:\t"+ str(new_tuples))
percent_tuple_reduction = (100 - round((100 * new_tuples) / original_tuples, 2))
print("Tuples Reduced By: \t"+ str(percent_tuple_reduction)+ "%")
percent_size_reduction = (100 - round((100 * new_size) / original_size, 2))
print('Size Reduced By: \t' + str(percent_size_reduction) + "%")

Original Tuple Count:	6561312
Cleansed Tuple Count:	1045419
Tuples Reduced By: 	84.07%
Size Reduced By: 	87.61%
