# Converting .tsv files to .csv 

In [18]:
import csv

import pandas as pd

In [19]:
# Location of the raw IMDb ratings dump (tab-separated).
input_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/TSV/title.ratings.tsv'
# Exporting the raw ratings as CSV is currently switched off:
# output_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/CSV/title.ratings.csv'

# Load the ratings table; a tab is the field separator.
ratings_df = pd.read_csv(
    input_file,
    sep='\t',
    low_memory=False,
)

# ratings_df.to_csv(output_file, index=False)

In [20]:
# File paths
input_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/TSV/title.basics.tsv'
output_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/CSV/title.basics.csv'

# Read the TSV file, mapping the '\N' placeholder to NaN.
# Quote handling is disabled so that a double quote inside a title is kept as
# ordinary text; with the default quoting, an opening '"' can swallow the
# following tabs/newlines and shift values into the wrong columns.
# NOTE(review): assumes the dump never quotes fields -- confirm against the data.
basics_df = pd.read_csv(
    input_file,
    sep='\t',
    na_values=['\\N'],
    quoting=csv.QUOTE_NONE,
    low_memory=False,
)

# Save the parsed DataFrame to a new CSV file (row index omitted).
basics_df.to_csv(output_file, index=False)


In [21]:
# Keep only the rows whose 'isAdult' flag equals 0.
basics_df1 = basics_df[basics_df['isAdult'].eq(0)]

In [22]:
# Every remaining row has isAdult == 0, so the column carries no information.
basics_df1 = basics_df1.drop(columns=['isAdult'])

In [23]:
# Print a summary of the filtered frame: index range, column dtypes, memory usage.
basics_df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10778866 entries, 0 to 11136778
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tconst          object 
 1   titleType       object 
 2   primaryTitle    object 
 3   originalTitle   object 
 4   startYear       float64
 5   endYear         float64
 6   runtimeMinutes  object 
 7   genres          object 
dtypes: float64(2), object(6)
memory usage: 740.1+ MB


In [24]:
# Keep titles whose start year is 1939 or later (rows with a missing year drop out).
basics_df2 = basics_df1[basics_df1['startYear'].ge(1939)]

In [25]:
# Convert 'startYear' to int; a value that cannot be converted raises.
basics_df2 = basics_df2.astype(
    dtype={'startYear': int},
    errors='raise',
)

In [26]:
# Restrict the data set to rows whose titleType is exactly 'movie'.
basics_df3 = basics_df2[basics_df2['titleType'].eq('movie')]

In [27]:
# 'titleType' is constant ('movie') after the filter above, so drop it.
basics_df3 = basics_df3.drop(columns=['titleType'])

In [28]:
# Remove the 'endYear' column from the movie subset.
basics_df3 = basics_df3.drop(columns=['endYear'])

In [29]:
# Discard every row that has a missing value in any remaining column.
basics_df4 = basics_df3.dropna(axis='index', how='any')

In [30]:
# Remove the 'originalTitle' column; 'primaryTitle' is kept.
basics_df4 = basics_df4.drop(columns=['originalTitle'])

In [31]:
# Convert 'runtimeMinutes' to int; a value that cannot be converted raises.
basics_df5 = basics_df4.astype(
    dtype={'runtimeMinutes': int},
    errors='raise',
)

In [32]:
# Preview the first five cleaned rows.
basics_df5.head(n=5)

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
13077,tt0013274,Istoriya grazhdanskoy voyny,2021,94,Documentary
15480,tt0015724,Dama de noche,1993,102,"Drama,Mystery,Romance"
18588,tt0018867,Escape from Hong Kong,1942,60,"Adventure,Mystery,War"
21267,tt0021617,Arizona Territory,1950,56,Western
21704,tt0022064,Lebbra bianca,1951,80,Drama


In [33]:
# Split each comma-separated 'genres' string, flatten the pieces, and keep the
# distinct labels in order of first appearance.
genres = (
    basics_df5['genres']
    .str.split(',')
    .explode()
    .unique()
)
genres

array(['Documentary', 'Drama', 'Mystery', 'Romance', 'Adventure', 'War',
       'Western', 'Musical', 'Comedy', 'Thriller', 'Crime', 'Film-Noir',
       'History', 'Biography', 'Fantasy', 'Action', 'Sport', 'Family',
       'Music', 'Horror', 'Animation', 'Sci-Fi', 'News', 'Talk-Show',
       'Reality-TV', 'Game-Show', 'Adult'], dtype=object)

In [34]:
# Build one 0/1 indicator column per genre.
# The labels are matched as whole comma-separated tokens: a plain substring
# test would also flag 'Music' for every title tagged 'Musical'.
genre_flags = basics_df5['genres'].str.get_dummies(sep=',')
for genre in genres:
    basics_df5[genre] = genre_flags[genre].astype(int)

In [35]:
# Remove the original comma-separated 'genres' column.
basics_df5 = basics_df5.drop(columns=['genres'])

In [36]:
# Strip the hyphens from the hyphenated genre column names.
basics_df6 = basics_df5.rename(
    columns={
        'Film-Noir': 'FilmNoir',
        'Sci-Fi': 'SciFi',
        'Talk-Show': 'TalkShow',
        'Reality-TV': 'RealityTV',
        'Game-Show': 'GameShow',
    }
)

In [37]:
def remove_single_quotes(text):
    """Return ``text`` with every single quote (') removed.

    Values that are not strings are handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    # Splitting on the quote and re-joining drops every occurrence.
    return "".join(text.split("'"))
# Run every title through remove_single_quotes, element by element.
basics_df6['primaryTitle'] = basics_df6['primaryTitle'].map(remove_single_quotes)

In [40]:
# Keep only the ratings whose tconst appears in the cleaned basics table.
ratings_df1 = ratings_df.loc[ratings_df['tconst'].isin(basics_df6['tconst'])]

In [41]:
# Keep only the titles whose tconst appears in the filtered ratings.
basics_df7 = basics_df6.loc[basics_df6['tconst'].isin(ratings_df1['tconst'])]

In [42]:
# Destination of the cleaned basics table.
output_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/CSV/title.basics_clean.csv'

# Write the table as CSV, leaving out the row index.
basics_df7.to_csv(
    output_file,
    index=False,
)

In [43]:
# Destination of the cleaned ratings table.
output_file = '/Users/hunterbecker/Desktop/BootcampRepos/IMDB/Resources/CSV/title.ratings_clean.csv'

# Write the table as CSV, leaving out the row index.
ratings_df1.to_csv(
    output_file,
    index=False,
)

# Enter the data into PostgreSQL using the table schema found in the repo

In [2]:
from sqlalchemy import create_engine
import psycopg2
import pandas as pd
from config import engine_key

# Build the connection URL from the value imported from config.
# The f-prefix is required: without it the literal text "{engine_key}" would be
# used as the URL instead of the configured value.
engine = create_engine(f'postgresql+psycopg2://{engine_key}')
conn = engine.connect()


In [3]:
# Load every row of the title_basics table into a DataFrame.
title_basics = pd.read_sql(
    sql="SELECT * FROM title_basics",
    con=conn,
)

In [4]:
# Load every row of the title_ratings table into a DataFrame.
title_ratings = pd.read_sql(
    sql="SELECT * FROM title_ratings",
    con=conn,
)

In [5]:
# Join basics and ratings on the shared 'tconst' key (default inner join).
imdb_df = title_basics.merge(title_ratings, on='tconst')

In [6]:
# Select the columns in display order: identifiers and metrics first,
# followed by the per-genre flag columns.
new_imdb_df = imdb_df.loc[:, [
    'tconst', 'primarytitle', 'startyear', 'runtimeminutes',
    'averagerating', 'numvotes',
    'documentary', 'drama', 'mystery', 'romance', 'adventure', 'war',
    'western', 'musical', 'comedy', 'thriller', 'crime', 'filmnoir',
    'history', 'biography', 'fantasy', 'action', 'sport', 'family',
    'music', 'horror', 'animation', 'scifi', 'news', 'talkshow',
    'realitytv', 'gameshow', 'adult',
]]

In [7]:
# Preview the first five rows of the merged, reordered table.
new_imdb_df.head(n=5)

Unnamed: 0,tconst,primarytitle,startyear,runtimeminutes,averagerating,numvotes,documentary,drama,mystery,romance,...,family,music,horror,animation,scifi,news,talkshow,realitytv,gameshow,adult
0,tt0013274,Istoriya grazhdanskoy voyny,2021,94,6.7,74,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,tt0015724,Dama de noche,1993,102,6.3,31,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,tt0018867,Escape from Hong Kong,1942,60,5.6,46,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,tt0021617,Arizona Territory,1950,56,6.1,62,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,tt0022064,Lebbra bianca,1951,80,5.2,62,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
# Function to update the Treeview based on filter
def update_treeview(filter_column=None, filter_value=None, sort_order=None):
    """Rebuild the Treeview rows from ``new_imdb_df``, optionally filtered and sorted.

    Args:
        filter_column: Name of the column used for both filtering and sorting.
            A value that is not a column of ``new_imdb_df`` (including ``None``
            and ``''``) disables filtering and sorting instead of raising.
        filter_value: Text to search for in ``filter_column``. Matching is a
            case-insensitive literal substring test on the string form of each
            value; an empty value disables filtering.
        sort_order: ``'Ascending'`` sorts ascending, any other truthy value
            sorts descending, and a falsy value leaves the order unchanged.
    """
    # An empty or mistyped combobox entry must not raise KeyError.
    column_known = filter_column in new_imdb_df.columns

    # Filtering and sorting both return new frames, so no defensive copy is needed.
    rows = new_imdb_df

    # Apply filter if specified
    if column_known and filter_value:
        # regex=False: user input such as "(" is treated as plain text rather
        # than as a (possibly invalid) regular expression.
        matches = rows[filter_column].astype(str).str.contains(
            filter_value, case=False, na=False, regex=False
        )
        rows = rows[matches]

    # Apply sorting if specified
    if column_known and sort_order:
        rows = rows.sort_values(by=filter_column, ascending=(sort_order == 'Ascending'))

    # Clear the widget only once the new view has been computed successfully.
    for item in treeview.get_children():
        treeview.delete(item)

    # itertuples avoids building a Series per row, unlike iterrows.
    for record in rows.itertuples(index=False, name=None):
        treeview.insert('', 'end', values=record)

# --- Main window -----------------------------------------------------------
root = tk.Tk()
root.title("Basics DataFrame Viewer")

# --- Filter row: column picker, free-text value, apply button ---------------
filter_frame = ttk.Frame(master=root)
filter_frame.pack(pady=10)

filter_label = tk.Label(master=filter_frame, text="Filter Column:")
filter_label.grid(row=0, column=0)

# The combobox offers every column of the merged table.
filter_column = ttk.Combobox(master=filter_frame, values=new_imdb_df.columns.tolist())
filter_column.grid(row=0, column=1)

filter_value_label = tk.Label(master=filter_frame, text="Filter Value:")
filter_value_label.grid(row=0, column=2)

filter_value = tk.Entry(master=filter_frame)
filter_value.grid(row=0, column=3)


def _on_apply_filter():
    """Refresh the table using the currently selected column and entered value."""
    update_treeview(filter_column.get(), filter_value.get(), None)


filter_button = tk.Button(master=filter_frame, text="Apply Filter", command=_on_apply_filter)
filter_button.grid(row=0, column=4)

# --- Sort row: column picker plus ascending / descending buttons ------------
sort_frame = ttk.Frame(master=root)
sort_frame.pack(pady=10)

sort_label = tk.Label(master=sort_frame, text="Sort By:")
sort_label.grid(row=0, column=0)

sort_column = ttk.Combobox(master=sort_frame, values=new_imdb_df.columns.tolist())
sort_column.grid(row=0, column=1)


def _on_sort(order):
    """Refresh the table sorted by the selected column in the given order."""
    # The chosen column travels in update_treeview's first argument; no filter value is passed.
    update_treeview(sort_column.get(), None, order)


ascending_button = tk.Button(
    master=sort_frame,
    text="Sort Ascending",
    command=lambda: _on_sort('Ascending'),
)
ascending_button.grid(row=0, column=2)

descending_button = tk.Button(
    master=sort_frame,
    text="Sort Descending",
    command=lambda: _on_sort('Descending'),
)
descending_button.grid(row=0, column=3)

# --- Table widget: one heading per DataFrame column --------------------------
treeview = ttk.Treeview(
    root,
    columns=new_imdb_df.columns.tolist(),
    show='headings',
)
treeview.pack(fill='both', expand=True)

# Label each column with its name, centred, at a fixed 100-pixel width.
for heading_name in new_imdb_df.columns:
    treeview.heading(heading_name, text=heading_name)
    treeview.column(heading_name, anchor='center', width=100)

# First fill: no filter and no sort, so every row is shown.
update_treeview()

# Hand control to the Tk event loop.
root.mainloop()

KeyboardInterrupt: 