In [1]:
"""
Imports:
    pandas as pd: This import statement brings in the pandas library, which is a powerful data manipulation and analysis tool for Python.

Usage:
    This notebook uses pandas to read the YTS dataset CSV files, combine and clean them, and write the combined result to a new CSV file.

Note:
    Ensure that the pandas library is installed in your Python environment before running this script.
"""
import pandas as pd


In [2]:
"""
Load the two source CSV files into pandas DataFrames.

Variables:
    df1 (pd.DataFrame): Contents of 'Original_Dataset_1.csv'.
    df2 (pd.DataFrame): Contents of 'Original_Dataset_2.csv'.
"""

# Both files are read with read_csv's default settings; the paths are
# relative, so they are resolved against the current working directory.
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ('Original_Dataset_1.csv', 'Original_Dataset_2.csv')
)

In [3]:
# Stack the two source frames vertically. ignore_index=True gives the result
# a fresh 0..n-1 index; without it each input keeps its own row labels, so the
# combined frame would contain repeated index labels.
combined_df = pd.concat([df1, df2], ignore_index=True)

# Keep only the first row seen for each 'URL', then renumber the remaining
# rows so the index stays unique and contiguous for any later index-aligned
# operation.
combined_df = combined_df.drop_duplicates(subset='URL').reset_index(drop=True)


In [4]:
# Remove the 'key' column; all other columns are kept as they are.
combined_df = combined_df.drop('key', axis='columns')

In [5]:
# Turn each " / "-separated 'Genres' string into a list of genre names and
# store it in a helper column named 'GenreList'.
combined_df = combined_df.assign(
    GenreList=combined_df['Genres'].str.split(' / ')
)

In [6]:
# Collect every distinct genre that occurs in any row's 'GenreList' and
# return them as a sorted list.
all_genres = sorted(set().union(*combined_df['GenreList']))


In [7]:
# Build one 0/1 indicator column per genre: a row gets 1 when the genre
# appears in that row's 'GenreList' and 0 otherwise. The frame shares
# combined_df's index so the two line up row for row.
genre_columns = pd.DataFrame(
    {
        genre: [int(genre in row_genres) for row_genres in combined_df['GenreList']]
        for genre in all_genres
    },
    index=combined_df.index,
)

# Attach the indicator columns next to the existing ones, then discard the
# helper 'GenreList' column now that it has been encoded.
combined_df = pd.concat([combined_df, genre_columns], axis=1).drop(
    columns=['GenreList']
)

In [8]:
# Final column layout: the descriptive columns in a fixed order, followed by
# one column per genre in the order given by all_genres.
desired_order = [
    'Movie_Title',
    'Movie_Director',
    'Released_Year',
    'Runtime',
    'IMDb_Rating',
    'IMDb_Votes',
    'YTS_Likes',
    'Uploader',
    'Uploaded_Time',
    'URL',
    'Seeds',
    'Genres',
    *all_genres,
]

In [9]:
# Put the columns in the order given by desired_order. reindex keeps only the
# listed columns; a listed name that is not in the frame becomes an all-NaN
# column rather than raising an error.
combined_df = combined_df.reindex(desired_order, axis='columns')

In [10]:
# Count the non-null entries in each column; a column whose count is lower
# than the number of rows contains missing values.
combined_df.count()

Movie_Title       12149
Movie_Director    12145
Released_Year     12149
Runtime           12149
IMDb_Rating       12144
IMDb_Votes        12144
YTS_Likes         12065
Uploader          12149
Uploaded_Time     12132
URL               12149
Seeds              1051
Genres            12149
Action            12149
Adventure         12149
Animation         12149
Biography         12149
Comedy            12149
Crime             12149
Documentary       12149
Drama             12149
Family            12149
Fantasy           12149
Film-Noir         12149
Game-Show         12149
History           12149
Horror            12149
Music             12149
Musical           12149
Mystery           12149
News              12149
Reality-TV        12149
Romance           12149
Sci-Fi            12149
Sport             12149
Talk-Show         12149
Thriller          12149
War               12149
Western           12149
dtype: int64

In [11]:
# Replace every missing value, in every column, with 0.
# NOTE(review): this also writes 0 into columns that look textual from their
# names (e.g. 'Movie_Director', 'Uploaded_Time') — confirm that is intended.
combined_df = combined_df.fillna(value=0)

In [12]:
# Order the rows by 'Movie_Title', smallest first (ascending is the
# sort_values default).
combined_df = combined_df.sort_values('Movie_Title')

In [13]:
# Write the combined frame to 'Combined_Dataset.csv'; index=False leaves the
# row index out of the file.
combined_df.to_csv(path_or_buf='Combined_Dataset.csv', index=False)