<a href="https://colab.research.google.com/github/PrathameshBawane/CODSOFT/blob/main/CODSOFT_TASK_1_Movie_Genre_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [82]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Load the training set. Each line is split on ' ::: ' into four fields:
# ID, Title, Genre, Description.
train_data = []
# Explicit UTF-8 so non-ASCII text decodes the same way on every platform
# instead of depending on the locale's default encoding.
with open('/content/train_data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines carry no record.
            continue
        # maxsplit=3: a ' ::: ' inside the description stays in the last
        # field rather than producing an extra column.
        train_data.append(line.split(' ::: ', 3))

train_df = pd.DataFrame(train_data, columns=['ID', 'Title', 'Genre', 'Description'])
train_df

Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...
...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on B...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The siste...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about gr..."
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and have...


In [83]:
# Load the unlabeled test set. Each line is split on ' ::: ' into three
# fields: ID, Title, Description.
test_data = []
# Explicit UTF-8 so non-ASCII titles decode the same way on every platform
# instead of depending on the locale's default encoding.
with open('test_data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines carry no record.
            continue
        # maxsplit=2: a ' ::: ' inside the description stays in the last
        # field rather than producing an extra column.
        test_data.append(line.split(' ::: ', 2))

test_df = pd.DataFrame(test_data, columns=['ID', 'Title', 'Description'])
test_df

Unnamed: 0,ID,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...
...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Dar..."
54196,54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their ...
54197,54198,Oliver Twink (2007),"A movie 169 years in the making. Oliver Twist,..."
54198,54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard ..."


In [84]:
# Load the test-set solution (the test rows with their true genre). Each line
# is split on ' ::: ' into four fields: ID, Title, Genre, Description.
test_solution_data = []
# Explicit UTF-8 so non-ASCII titles decode the same way on every platform
# instead of depending on the locale's default encoding.
with open('test_data_solution.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines carry no record.
            continue
        # maxsplit=3: a ' ::: ' inside the description stays in the last
        # field rather than producing an extra column.
        test_solution_data.append(line.split(' ::: ', 3))

test_solution_df = pd.DataFrame(test_solution_data, columns=['ID', 'Title', 'Genre', 'Description'])
test_solution_df

Unnamed: 0,ID,Title,Genre,Description
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a marti...
...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Dar..."
54196,54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their ...
54197,54198,Oliver Twink (2007),adult,"A movie 169 years in the making. Oliver Twist,..."
54198,54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard ..."


In [85]:
# Build the model input for both splits: title and description joined by a
# single space, stored in a new 'Text' column.
for frame in (train_df, test_df):
    frame['Text'] = frame['Title'] + " " + frame['Description']

In [86]:
# TF-IDF features with English stop words removed and at most 5000 features.
tfidf_options = {'stop_words': 'english', 'max_features': 5000}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vectorizer on the training text and vectorize it in the same call.
train_corpus = train_df['Text']
X_train = vectorizer.fit_transform(train_corpus)
X_train

<54214x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1909831 stored elements in Compressed Sparse Row format>

In [89]:
# Vectorize the test text with the already-fitted vectorizer
# (transform only -- the vectorizer is not refitted here).
test_corpus = test_df['Text']
X_test = vectorizer.transform(test_corpus)
X_test

<54200x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1908408 stored elements in Compressed Sparse Row format>

In [90]:
# Training target: the 'Genre' column of the training frame.
y_train = train_df.loc[:, 'Genre']
y_train

0              drama
1           thriller
2              adult
3              drama
4              drama
            ...     
54209         comedy
54210         horror
54211    documentary
54212         comedy
54213        history
Name: Genre, Length: 54214, dtype: object

In [91]:
# Linear SVM classifier with scikit-learn's default hyperparameters.
# random_state is pinned so repeated runs produce the same model; it only
# matters when the solver shuffles the data (the dual formulation).
clf = LinearSVC(random_state=42)
clf

In [92]:
# Fit on the training matrix, then predict a genre for every test row.
# fit() returns the estimator itself, so the two steps can be chained.
y_test_pred = clf.fit(X_train, y_train).predict(X_test)
y_test_pred

array(['short', 'drama', 'documentary', ..., 'documentary', 'horror',
       'drama'], dtype=object)

In [93]:
# Ground-truth genres for the test set, taken from the solution file.
# Predictions are compared with these labels by position, so verify that the
# solution rows line up with the test rows before scoring.
assert len(test_solution_df) == len(test_df), \
    "test set and solution file have different numbers of rows"
assert (test_solution_df['ID'].to_numpy() == test_df['ID'].to_numpy()).all(), \
    "test set and solution file rows are not in the same order"
y_test_true = test_solution_df['Genre']
y_test_true

0           thriller
1             comedy
2        documentary
3              drama
4              drama
            ...     
54195         horror
54196        western
54197          adult
54198          drama
54199          drama
Name: Genre, Length: 54200, dtype: object

In [94]:
# Overall accuracy of the predicted genres against the true genres.
accuracy = accuracy_score(y_true=y_test_true, y_pred=y_test_pred)
accuracy

0.5792619926199262

In [95]:
# Per-genre precision, recall, F1 and support as a text table.
report = classification_report(y_true=y_test_true, y_pred=y_test_pred)
report

'              precision    recall  f1-score   support\n\n      action       0.39      0.32      0.35      1314\n       adult       0.52      0.38      0.44       590\n   adventure       0.41      0.22      0.29       775\n   animation       0.30      0.14      0.19       498\n   biography       0.06      0.00      0.01       264\n      comedy       0.54      0.56      0.55      7446\n       crime       0.21      0.07      0.11       505\n documentary       0.69      0.82      0.75     13096\n       drama       0.57      0.72      0.63     13612\n      family       0.32      0.13      0.19       783\n     fantasy       0.29      0.11      0.16       322\n   game-show       0.79      0.60      0.68       193\n     history       0.19      0.02      0.04       243\n      horror       0.59      0.61      0.60      2204\n       music       0.60      0.51      0.55       731\n     musical       0.25      0.08      0.12       276\n     mystery       0.29      0.07      0.11       318\n       

In [96]:
# Print the summary; print() renders the report's line breaks, unlike the
# escaped repr shown when the string is echoed as a cell result.
print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")

Accuracy: 0.5792619926199262
Classification Report:
              precision    recall  f1-score   support

      action       0.39      0.32      0.35      1314
       adult       0.52      0.38      0.44       590
   adventure       0.41      0.22      0.29       775
   animation       0.30      0.14      0.19       498
   biography       0.06      0.00      0.01       264
      comedy       0.54      0.56      0.55      7446
       crime       0.21      0.07      0.11       505
 documentary       0.69      0.82      0.75     13096
       drama       0.57      0.72      0.63     13612
      family       0.32      0.13      0.19       783
     fantasy       0.29      0.11      0.16       322
   game-show       0.79      0.60      0.68       193
     history       0.19      0.02      0.04       243
      horror       0.59      0.61      0.60      2204
       music       0.60      0.51      0.55       731
     musical       0.25      0.08      0.12       276
     mystery       0.29      

In [97]:
# Predicted genre for every test row, stored under the name used when the
# predictions are attached to test_df.
test_predictions = clf.predict(X=X_test)
test_predictions

array(['short', 'drama', 'documentary', ..., 'documentary', 'horror',
       'drama'], dtype=object)

In [98]:
# Attach the predictions to the test frame and save the frame to disk.
test_df['Predicted_Genre'] = test_predictions
# The content is comma-separated, so it is saved with a .csv extension in
# the working directory (index=False leaves the row index out of the file).
test_df.to_csv('test_predictions.csv', index=False)

In [99]:
# Write the predictions to test_predictions.txt with ' ::: ' between fields.
# Rows are taken straight from test_df instead of replacing commas in a CSV
# dump: titles and descriptions contain commas of their own, which a blanket
# replace would turn into bogus separators, and CSV quoting would be left
# behind in the text.
columns = [str(column) for column in test_df.columns]
lines = [' ::: '.join(columns)]  # header row with the column names
lines.extend(
    ' ::: '.join(str(value) for value in row)
    for row in test_df.itertuples(index=False, name=None)
)
data = '\n'.join(lines) + '\n'
with open('test_predictions.txt', 'w', encoding='utf-8') as file:
    file.write(data)