# In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

# Pipeline: TF-IDF on the 'all_keywords' text -> 1-component truncated SVD ->
# min-max scaling -> KMeans. Each test row is then given the mean training
# rating of the cluster it is assigned to, and the result is written out as a
# submission CSV keyed by "<userId>_<movieId>".
train_df = pd.read_csv("checkpoint_train.csv")
test_df = pd.read_csv("checkpoint_test.csv")

text_columns = ['overview', 'tagline', 'title', 'all_keywords']

# Every non-text column is carried forward unchanged (the training frame also
# keeps 'rating' here; it is split off into y_train further down).
num_train = train_df.drop(columns=text_columns)
num_test = test_df.drop(columns=text_columns)

# Missing text becomes the empty string so the vectorizer never sees NaN.
train_df[text_columns] = train_df[text_columns].fillna("").astype(str)
test_df[text_columns] = test_df[text_columns].fillna("").astype(str)

# The vocabulary and IDF weights are learned from the training rows only.
vectorizer = TfidfVectorizer()
tfidf_train_matrix = vectorizer.fit_transform(train_df['all_keywords'])
tfidf_test_matrix = vectorizer.transform(test_df['all_keywords'])

# Fit the SVD on the training matrix only and merely *project* the test
# matrix. Refitting on the test matrix would place the test component on a
# different axis than the training component (and leak test data).
# random_state pins the randomized solver so reruns give the same output.
svd_model = TruncatedSVD(n_components=1, random_state=42)
svd_matrix_train = svd_model.fit_transform(tfidf_train_matrix)
svd_matrix_test = svd_model.transform(tfidf_test_matrix)

# Give the SVD components string column names: recent scikit-learn versions
# reject DataFrames whose column names mix str and non-str types, which is
# what an unnamed DataFrame (integer column 0) would produce after the merge.
svd_columns = [f"keywords_svd_{i}" for i in range(svd_matrix_train.shape[1])]
train_df = num_train.join(
    pd.DataFrame(svd_matrix_train, index=num_train.index, columns=svd_columns)
)
test_df = num_test.join(
    pd.DataFrame(svd_matrix_test, index=num_test.index, columns=svd_columns)
)

X_train = train_df.drop(columns='rating')
y_train = train_df['rating']

# Select the test features by name so they match the training features in
# both membership and order before they reach the fitted scaler.
X_test = test_df[X_train.columns]

# Scale features to the range [0, 1] (range learned from the training data).
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Use KMeans clustering. It is unsupervised, so no target is passed to fit().
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train)

# Assign cluster labels to the test data
test_df['cluster'] = kmeans.predict(X_test)

# Calculate mean rating for each cluster in the training data
cluster_means = train_df.groupby(kmeans.labels_)['rating'].mean()

# Assign predicted ratings based on the cluster means
test_df['rating'] = test_df['cluster'].map(cluster_means)

# Build the submission key "<userId>_<movieId>"; the int cast drops any
# float formatting (e.g. "12.0") before the ids are turned into strings.
test_df['movieId'] = test_df['movieId'].astype(int).astype(str)
test_df['userId'] = test_df['userId'].astype(int).astype(str)
test_df['userId_movieId'] = test_df['userId'] + '_' + test_df['movieId']
cols = ['userId_movieId', 'rating']
test_df = test_df[cols]

test_df.to_csv('submission_tfidf_justkeywords_svdtrunc_kmeans.csv', index=False)

