# Converting .tsv files to .csv 

In [18]:
import pandas as pd

In [19]:
# File paths
input_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/TSV/title.ratings.tsv'
# output_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/CSV/title.ratings.csv'

# Read the TSV data into a DataFrame
ratings_df = pd.read_csv(input_file, sep='\t', low_memory=False)  # Read as TSV

# # Save to a CSV file
# ratings_df.to_csv(output_file, index=False)  # Save as CSV without the index

In [20]:
# File paths
input_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/TSV/title.basics.tsv'
output_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/CSV/title.basics.csv'

# Read the TSV file, replace '\N' with NaN, and handle bad lines
basics_df = pd.read_csv(input_file, sep='\t', na_values=['\\N'], low_memory=False)

# Save the processed DataFrame to a new CSV file
basics_df.to_csv(output_file, index=False)


In [21]:
basics_df1 = basics_df.loc[basics_df['isAdult'] == 0]

In [22]:
del basics_df1['isAdult']

In [23]:
basics_df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10778866 entries, 0 to 11136778
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tconst          object 
 1   titleType       object 
 2   primaryTitle    object 
 3   originalTitle   object 
 4   startYear       float64
 5   endYear         float64
 6   runtimeMinutes  object 
 7   genres          object 
dtypes: float64(2), object(6)
memory usage: 740.1+ MB


In [24]:
basics_df2 = basics_df1.loc[basics_df1['startYear'] >= 1939]

In [25]:
basics_df2 = basics_df2.astype({'startYear':int}, errors='raise')

In [26]:
basics_df3 = basics_df2.loc[basics_df2['titleType'] == 'movie']

In [27]:
del basics_df3['titleType']

In [28]:
del basics_df3['endYear']

In [29]:
basics_df4 = basics_df3.dropna(how = 'any')

In [30]:
del basics_df4['originalTitle']

In [31]:
basics_df5 = basics_df4.astype({'runtimeMinutes':int}, errors='raise')

In [32]:
basics_df5.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
13077,tt0013274,Istoriya grazhdanskoy voyny,2021,94,Documentary
15480,tt0015724,Dama de noche,1993,102,"Drama,Mystery,Romance"
18588,tt0018867,Escape from Hong Kong,1942,60,"Adventure,Mystery,War"
21267,tt0021617,Arizona Territory,1950,56,Western
21704,tt0022064,Lebbra bianca,1951,80,Drama


In [33]:
# basics_df.head()
genres = basics_df5['genres'].str.split(',').explode().unique()
genres

array(['Documentary', 'Drama', 'Mystery', 'Romance', 'Adventure', 'War',
       'Western', 'Musical', 'Comedy', 'Thriller', 'Crime', 'Film-Noir',
       'History', 'Biography', 'Fantasy', 'Action', 'Sport', 'Family',
       'Music', 'Horror', 'Animation', 'Sci-Fi', 'News', 'Talk-Show',
       'Reality-TV', 'Game-Show', 'Adult'], dtype=object)

In [34]:
for genre in genres:
    basics_df5[genre] = basics_df5['genres'].str.contains(genre).astype(int)

In [35]:
del basics_df5['genres']

In [36]:
basics_df6 = basics_df5.rename(columns={'Film-Noir':'FilmNoir','Sci-Fi':'SciFi','Talk-Show':'TalkShow','Reality-TV':'RealityTV','Game-Show':'GameShow'})

In [37]:
def remove_single_quotes(text):
    if isinstance(text, str):
        return text.replace("'", "")  # Remove single quotes
    return text
# Apply the function to remove single quotes
basics_df6['primaryTitle'] = basics_df6['primaryTitle'].apply(remove_single_quotes)

In [40]:
ratings_df1 = ratings_df[ratings_df['tconst'].isin(basics_df6['tconst'])]

In [41]:
basics_df7 = basics_df6[basics_df6['tconst'].isin(ratings_df1['tconst'])]

In [42]:
output_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/CSV/title.basics_clean.csv'

# Save the processed DataFrame to a new CSV file
basics_df7.to_csv(output_file, index=False)

In [43]:
output_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/CSV/title.ratings_clean.csv'

# Save the processed DataFrame to a new CSV file
ratings_df1.to_csv(output_file, index=False)

# Enter data into postgres using the table schema found in the repo 

In [3]:
from sqlalchemy import create_engine
import psycopg2
import pandas as pd


engine = create_engine('postgresql+psycopg2://postgres:postgres@localhost/IMDB')
conn = engine.connect()


In [6]:
# Query All Records in the the Database
title_basics = pd.read_sql("SELECT * FROM title_basics", conn)

In [7]:
# Query All Records in the the Database
title_ratings = pd.read_sql("SELECT * FROM title_ratings", conn)

In [8]:
imdb_df = pd.merge(title_basics,title_ratings,on='tconst')

In [9]:
imdb_df.head()

Unnamed: 0,tconst,titletype,primarytitle,startyear,runtimeminutes,documentary,drama,mystery,romance,adventure,...,horror,animation,scifi,news,talkshow,realitytv,gameshow,adult,averagerating,numvotes
0,tt0013274,movie,Istoriya grazhdanskoy voyny,2021,94,True,False,False,False,False,...,False,False,False,False,False,False,False,False,6.7,74
1,tt0015724,movie,Dama de noche,1993,102,False,True,True,True,False,...,False,False,False,False,False,False,False,False,6.3,31
2,tt0018867,movie,Escape from Hong Kong,1942,60,False,False,True,False,True,...,False,False,False,False,False,False,False,False,5.6,46
3,tt0021617,movie,Arizona Territory,1950,56,False,False,False,False,False,...,False,False,False,False,False,False,False,False,6.1,62
4,tt0022064,movie,Lebbra bianca,1951,80,False,True,False,False,False,...,False,False,False,False,False,False,False,False,5.2,62
