In [1]:
import numpy as np 
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [9]:
train = pd.read_csv("./GenreClassificationDataset/train_data.txt", delimiter='\t')

In [2]:


# Read the .txt file and split lines by ':::'
with open('./GenreClassificationDataset/train_data.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

data = [line.strip().split(' ::: ') for line in lines]

# Create DataFrame with appropriate column names
df = pd.DataFrame(data, columns=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])

# Optionally, convert ID column to integer type
df['ID'] = df['ID'].astype(int)



In [3]:

# Display the DataFrame
df.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [5]:
print(df.dtypes)
existing_genres = df['GENRE'].unique()
print(existing_genres)

ID              int32
TITLE          object
GENRE          object
DESCRIPTION    object
dtype: object
['drama' 'thriller' 'adult' 'documentary' 'comedy' 'crime' 'reality-tv'
 'horror' 'sport' 'animation' 'action' 'fantasy' 'short' 'sci-fi' 'music'
 'adventure' 'talk-show' 'western' 'family' 'mystery' 'history' 'news'
 'biography' 'romance' 'game-show' 'musical' 'war']


In [6]:
print(df.isnull().sum())

ID             0
TITLE          0
GENRE          0
DESCRIPTION    0
dtype: int64


In [7]:
df.drop(columns=['TITLE'], inplace=True)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [9]:

# Read the .txt file and split lines by ':::'
with open('./GenreClassificationDataset/test_data.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

data2 = [line.strip().split(' ::: ') for line in lines]

# Create DataFrame with appropriate column names
test_df = pd.DataFrame(data2, columns=['ID', 'TITLE', 'DESCRIPTION'])

# Optionally, convert ID column to integer type
test_df['ID'] = test_df['ID'].astype(int)



In [13]:
test_df.head()


Unnamed: 0,ID,TITLE,DESCRIPTION
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


In [14]:
df.head()

Unnamed: 0,ID,GENRE,DESCRIPTION
0,1,drama,Listening in to a conversation between his doc...
1,2,thriller,A brother and sister with a past incestuous re...
2,3,adult,As the bus empties the students for their fiel...
3,4,drama,To help their unemployed father make ends meet...
4,5,drama,The film's title refers not only to the un-rec...


In [15]:
# Concatenate the 'DESCRIPTION' columns of df and test_df
new_df = pd.concat([df['DESCRIPTION'], test_df['DESCRIPTION']], ignore_index=True)

# Create a new DataFrame with the concatenated 'DESCRIPTION' column
new_df = pd.DataFrame({'DESCRIPTION': new_df})

In [17]:
new_df.head()

Unnamed: 0,DESCRIPTION
0,Listening in to a conversation between his doc...
1,A brother and sister with a past incestuous re...
2,As the bus empties the students for their fiel...
3,To help their unemployed father make ends meet...
4,The film's title refers not only to the un-rec...


In [18]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['DESCRIPTION'])
y = df['GENRE']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)



In [19]:
# Predict genre labels for the test set
y_pred = model.predict(X_test)



In [20]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5900581020012912


In [22]:

# Read the .txt file and split lines by ':::'
with open('./GenreClassificationDataset/test_data.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

data2 = [line.strip().split(' ::: ') for line in lines]

# Create DataFrame with appropriate column names
test_df = pd.DataFrame(data2, columns=['ID', 'TITLE', 'DESCRIPTION'])

# Optionally, convert ID column to integer type
test_df['ID'] = test_df['ID'].astype(int)



In [27]:
sub = pd.DataFrame(data2, columns=['ID', 'TITLE', 'DESCRIPTION'])



In [34]:
test_df.drop(columns=['ID'], inplace=True)

In [26]:
test_df.head()

Unnamed: 0,ID,TITLE,DESCRIPTION
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


In [23]:
X_new = tfidf.fit_transform(test_df['DESCRIPTION'])

In [24]:
ytest_pred = model.predict(X_new)

In [25]:
ytest_pred


array(['drama', 'drama', 'drama', ..., 'short', 'drama', 'drama'],
      dtype=object)

In [31]:
predictions_df = pd.DataFrame({'GENRE': ytest_pred})

# Add a new column 'ID'
predictions_df['ID'] = sub['ID']
predictions_df['TITLE'] = sub['TITLE']
predictions_df['DESCRIPTION'] = sub['DESCRIPTION']



In [38]:
#predictions_df = predictions_df[['TITLE', 'GENRE']]
predictions_df = predictions_df[['ID', 'TITLE', 'GENRE', 'DESCRIPTION']]
predictions_df

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),drama,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),drama,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),drama,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),documentary,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),documentary,Before he was known internationally as a marti...
...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)",drama,"Covering multiple genres, Tales of Light & Dar..."
54196,54197,Der letzte Mohikaner (1965),comedy,As Alice and Cora Munro attempt to find their ...
54197,54198,Oliver Twink (2007),short,"A movie 169 years in the making. Oliver Twist,..."
54198,54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard ..."


In [39]:
# Export the DataFrame to a new file
predictions_df.to_csv('predictions_with_ids.csv', index=False)

print("DataFrame with IDs and predictions exported successfully.")

DataFrame with IDs and predictions exported successfully.


Calculating the score


In [37]:
SOL_df

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a marti...
...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Dar..."
54196,54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their ...
54197,54198,Oliver Twink (2007),adult,"A movie 169 years in the making. Oliver Twist,..."
54198,54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard ..."


In [40]:
#Calculating the accuracy
from sklearn.metrics import accuracy_score

# Assuming df_ground_truth is the ground truth DataFrame and df_predicted is the predicted DataFrame

# Ensure both DataFrames have the same length
if len(SOL_df) != len(predictions_df):
    raise ValueError("Both DataFrames must have the same length.")

# Calculate accuracy
accuracy = accuracy_score(SOL_df['GENRE'], predictions_df['GENRE'])

print("Accuracy:", accuracy)
#I got an accuracy of 0.28

Accuracy: 0.28284132841328413


Importing the solution for comparing


In [35]:
# Read the .txt file and split lines by ':::'
with open('./GenreClassificationDataset/test_data_solution.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

datasol = [line.strip().split(' ::: ') for line in lines]

# Create DataFrame with appropriate column names
SOL_df = pd.DataFrame(datasol, columns=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])

# Optionally, convert ID column to integer type
SOL_df['ID'] = SOL_df['ID'].astype(int)



In [9]:
SOL_df.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a marti...
