In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
def load_train_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ::: ')
            data.append((parts[3], parts[2].split(', ')))
    return data

train_data = load_train_data('train_data.txt')

In [3]:
# Preprocess train data
train_texts, train_labels = zip(*train_data)

In [4]:
# Feature extraction
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train = vectorizer.fit_transform(train_texts)

In [5]:
# Convert labels to binary form
mlb = MultiLabelBinarizer()
Y_train = mlb.fit_transform(train_labels)

# Train the model
classifier = OneVsRestClassifier(LinearSVC())
classifier.fit(X_train, Y_train)

In [8]:
# Load test data
def load_test_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ::: ')
            data.append((parts[0], parts[2]))
    return data

test_data = load_test_data('test_data.txt')

In [9]:
# Preprocess test data
test_ids, test_texts = zip(*test_data)

In [10]:
# Feature extraction for test data
X_test = vectorizer.transform(test_texts)

In [11]:
# Prediction
predictions = classifier.predict(X_test)
predicted_labels = mlb.inverse_transform(predictions)

# Combine test IDs with predicted genres
predicted_data = list(zip(test_ids, predicted_labels))

# Output predictions
for movie_id, genres in predicted_data:
    print(f"Movie ID: {movie_id}, Predicted Genres: {', '.join(genres)}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Movie ID: 39165, Predicted Genres: documentary
Movie ID: 39166, Predicted Genres: 
Movie ID: 39167, Predicted Genres: music
Movie ID: 39168, Predicted Genres: documentary
Movie ID: 39169, Predicted Genres: drama
Movie ID: 39170, Predicted Genres: drama
Movie ID: 39171, Predicted Genres: documentary
Movie ID: 39172, Predicted Genres: 
Movie ID: 39173, Predicted Genres: 
Movie ID: 39174, Predicted Genres: documentary
Movie ID: 39175, Predicted Genres: 
Movie ID: 39176, Predicted Genres: documentary
Movie ID: 39177, Predicted Genres: 
Movie ID: 39178, Predicted Genres: 
Movie ID: 39179, Predicted Genres: 
Movie ID: 39180, Predicted Genres: drama
Movie ID: 39181, Predicted Genres: 
Movie ID: 39182, Predicted Genres: documentary
Movie ID: 39183, Predicted Genres: 
Movie ID: 39184, Predicted Genres: comedy
Movie ID: 39185, Predicted Genres: 
Movie ID: 39186, Predicted Genres: 
Movie ID: 39187, Predicted Genres: drama
Movie ID: 

In [12]:
import csv

# Define the file path for the CSV output
output_file = "predictions.csv"

# Open the CSV file in write mode and write the predictions
with open(output_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["ID", "Genres"])
    # Write the predictions
    for movie_id, genres in predicted_data:
        writer.writerow([movie_id, ', '.join(genres)])

print("Predictions exported to", output_file)

Predictions exported to predictions.csv


In [13]:
from sklearn.metrics import accuracy_score

# Load the ground truth labels for the test data
def load_ground_truth(file_path):
    labels = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ::: ')
            labels[parts[0]] = parts[2].split(', ')
    return labels

ground_truth = load_ground_truth('test_data.txt')

# Convert ground truth labels to binary form
Y_test = mlb.transform([ground_truth[movie_id] for movie_id, _ in test_data])

# Calculate accuracy
accuracy = accuracy_score(Y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.5370211031609455


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [14]:
from sklearn.model_selection import cross_val_score, train_test_split

# Split the training data into train and validation sets
X_train_split, X_val_split, Y_train_split, Y_val_split = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

# Train the model on the training split
classifier.fit(X_train_split, Y_train_split)

# Perform cross-validation
cv_scores = cross_val_score(classifier, X_train, Y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Evaluate on the validation set
val_predictions = classifier.predict(X_val_split)
val_accuracy = accuracy_score(Y_val_split, val_predictions)
print("Validation set accuracy:", val_accuracy)

# Evaluate on the test set if ground truth labels are available
if ground_truth:
    test_accuracy = accuracy_score(Y_test, predictions)
    print("Test set accuracy:", test_accuracy)
else:
    print("Ground truth labels for the test set are not available. Cannot evaluate on test set.")

Cross-validation scores: [0.3230645  0.32387216 0.3280637  0.32690976 0.33429495]
Mean CV accuracy: 0.3272410123088788
Validation set accuracy: 0.32594900196146304
Test set accuracy: 0.5370211031609455


In [15]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters grid
param_grid = {'estimator__C': [0.001, 0.01, 0.1, 1, 10]}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=OneVsRestClassifier(LinearSVC()), param_grid=param_grid, cv=5)

# Perform grid search
grid_search.fit(X_train, Y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

# Use the best model for prediction
best_classifier = grid_search.best_estimator_
predictions = best_classifier.predict(X_test)

Best hyperparameters: {'estimator__C': 10}


In [16]:
# Export the predictions into a CSV file
output_file = "predictions1.csv"

with open(output_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["ID", "Genres"])
    for movie_id, genres in zip(test_ids, predicted_labels):
        writer.writerow([movie_id, ', '.join(genres)])

print("Predictions exported to", output_file)

Predictions exported to predictions1.csv
