<a href="https://colab.research.google.com/github/PrathameshBawane/CODSOFT/blob/main/CODSOFT_TASK_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

description_file_path = r'/content/description.txt'
test_solution_file_path = r'/content/test_data_solution.txt'

# Loading the data from description.txt
data = pd.read_csv(description_file_path, sep=' ::: ', engine='python', header=None, names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])
data

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,Train data:,,,
1,ID,TITLE,GENRE,DESCRIPTION
2,ID,TITLE,GENRE,DESCRIPTION
3,ID,TITLE,GENRE,DESCRIPTION
4,ID,TITLE,GENRE,DESCRIPTION
5,Test data:,,,
6,ID,TITLE,DESCRIPTION,
7,ID,TITLE,DESCRIPTION,
8,ID,TITLE,DESCRIPTION,
9,ID,TITLE,DESCRIPTION,


In [None]:
# Spliting the data into train and test sets based on the presence of the GENRE column
train_data = data.dropna(subset=['GENRE'])
test_data = data[data['GENRE'].isna()].drop(columns=['GENRE'])

# Loading the test solution data
test_solution = pd.read_csv(test_solution_file_path, sep=' ::: ', engine='python', names=['ID', 'TITLE', 'GENRE'])
test_solution

Unnamed: 0,ID,TITLE,GENRE
1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
5,Er nu zhai (1955),drama,Before he was known internationally as a marti...
...,...,...,...
54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Dar..."
54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their ...
54198,Oliver Twink (2007),adult,"A movie 169 years in the making. Oliver Twist,..."
54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard ..."


In [None]:
# Checking for missing values in DESCRIPTION and fill them with an empty string
train_data['DESCRIPTION'] = train_data['DESCRIPTION'].fillna('')
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].fillna('')

# Checking the first few rows of the datasets
print("Train Data:")
print(train_data.head())
print("\nTest Data:")
print(test_data.head())
print("\nTest Solution Data:")
print(test_solution.head())

# Printing the shapes of the datasets to check for consistency
print("\nShapes of datasets:")
print("Train Data Shape:", train_data.shape)
print("Test Data Shape:", test_data.shape)
print("Test Solution Data Shape:", test_solution.shape)

# Creating a TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf

Train Data:
   ID  TITLE        GENRE  DESCRIPTION
1  ID  TITLE        GENRE  DESCRIPTION
2  ID  TITLE        GENRE  DESCRIPTION
3  ID  TITLE        GENRE  DESCRIPTION
4  ID  TITLE        GENRE  DESCRIPTION
6  ID  TITLE  DESCRIPTION             

Test Data:
                                                  ID TITLE DESCRIPTION
0                                        Train data:  None            
5                                         Test data:  None            
10                                           Source:  None            
11  ftp://ftp.fu-berlin.de/pub/misc/movies/database/  None            

Test Solution Data:
                            ID        TITLE  \
1         Edgar's Lunch (1998)     thriller   
2     La guerra de papá (1977)       comedy   
3  Off the Beaten Track (2010)  documentary   
4       Meu Amigo Hindu (2015)        drama   
5            Er nu zhai (1955)        drama   

                                               GENRE  
1  L.R. Brane loves his life

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['DESCRIPTION'] = train_data['DESCRIPTION'].fillna('')


In [None]:


# Creating a pipeline with TF-IDF and Logistic Regression
model = Pipeline([
    ('tfidf', tfidf),
    ('clf', LogisticRegression(max_iter=1000))
])
model

In [None]:
# Spliting the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_data['DESCRIPTION'], train_data['GENRE'], test_size=0.2, random_state=42)

# Training the model
model.fit(X_train, y_train)

# Validating the model
val_predictions = model.predict(X_val)
val_predictions

array(['GENRE', 'DESCRIPTION'], dtype=object)

In [None]:
print("\nValidation Accuracy:", accuracy_score(y_val, val_predictions))
print("Validation Classification Report:\n", classification_report(y_val, val_predictions))

test_predictions = model.predict(test_data['DESCRIPTION'])
test_predictions


Validation Accuracy: 1.0
Validation Classification Report:
               precision    recall  f1-score   support

 DESCRIPTION       1.00      1.00      1.00         1
       GENRE       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



array(['DESCRIPTION', 'DESCRIPTION', 'DESCRIPTION', 'DESCRIPTION'],
      dtype=object)

In [None]:
print("\nTest Predictions Shape:", test_predictions.shape)

# Evaluating on test data using the provided solution
if len(test_solution) == len(test_predictions):
    test_accuracy = accuracy_score(test_solution['GENRE'], test_predictions)
    print("\nTest Accuracy:", test_accuracy)
    print("Test Classification Report:\n", classification_report(test_solution['GENRE'], test_predictions))
else:
    print("Mismatch in number of samples between test data and test solution.")


Test Predictions Shape: (4,)
Mismatch in number of samples between test data and test solution.
