In [None]:
import re
import string
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import seaborn as sns
from googletrans import Translator
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from sentence_transformers import SentenceTransformer

# Installs a catch-all "ignore" filter, so no warning of any category is shown
# by the cells that follow.
# NOTE(review): this also hides pandas parser / chained-assignment warnings and
# library deprecation notices — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Training data

In [None]:
# Fields in the raw file are separated by ":::". pandas interprets separators
# longer than one character as regular expressions, which only the Python
# parser supports, so that engine is requested explicitly instead of relying on
# the implicit (warned-about) fallback from the C parser.
# The path is relative to the same "../data" directory the later cells read
# from, so the whole notebook runs from a single working directory.
train_data = pd.read_csv(
    '../data/train_data.txt',
    sep=":::",
    names=["Title", "Genre", "Summary"],
    engine="python",
)
train_data.head()

### Unique genres

In [None]:
# Number of training rows per genre label, drawn as a wide bar chart.
counts = train_data['Genre'].value_counts()

fig, axis = plt.subplots(figsize=(30, 10))
axis.bar(counts.index, counts.values)
axis.set_xlabel('Genre')
axis.set_ylabel('Number of movies')
plt.show()

### Unique languages

In [None]:
# langdetect's algorithm is randomised; fixing the seed makes every run assign
# the same language to the same summary.
DetectorFactory.seed = 0

langs = []
for summary in train_data["Summary"]:
    try:
        langs.append(detect(summary))
    except (LangDetectException, TypeError):
        # detect() raises when a summary offers nothing to classify (for
        # example an empty string) or is not text at all. Label the row instead
        # of aborting the whole pass; "unknown" rows are excluded later by the
        # English-only filter.
        langs.append("unknown")

uniq_langs = set(langs)

train_data["Language"] = langs
print(uniq_langs)

# How many summaries were not detected as English.
len(train_data[train_data["Language"] != "en"])

In [None]:
# Reload the training frame from a previously saved CSV (presumably the output
# of the language-detection step above, given the file name — TODO confirm).
# NOTE(review): the disabled write below targets 'data/...' while the read uses
# '../data/...'; confirm both point at the same file before re-enabling it.
# train_data.to_csv('data/train_data_with_languages.csv')
# index_col=0 makes the first CSV column the row index instead of a data column.
train_data = pd.read_csv('../data/train_data_with_languages.csv', index_col=0)
train_data.head()

In [None]:
# The data covers many genres (drama, documentary, ...), but a handful of them
# account for most of the rows, which may bias a classifier.
#
# The summaries are also written in several languages. Non-English ones are
# rare compared to English, so for simplicity only English rows are kept for now.

# Re-reading the saved frame keeps this cell re-runnable. The .copy() turns the
# English subset into an independent frame, so the column assignments made in
# later cells modify it directly instead of a filtered slice of the loaded data
# (which pandas flags as chained assignment).
train_data = pd.read_csv('../data/train_data_with_languages.csv', index_col=0)
train_data = train_data[train_data['Language'] == 'en'].copy()
train_data.head()

In [None]:
def clean_text(text):
    """Remove boilerplate, handles, URLs, punctuation, digits and lone letters.

    The steps run in a fixed order: the translation notice, ``@...`` tokens and
    ``http...`` tokens are removed first because they can no longer be
    recognised once punctuation has been stripped.

    Parameters
    ----------
    text : str
        Raw summary text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed or trimmed, so removed
        tokens can leave double or trailing spaces behind.
    """
    # Fixed boilerplate sentence: a plain replace matches it literally, whereas
    # using it as a regex would treat every '.' as "any character".
    text = text.replace('Mail <svaradi@sprynet.com> for translation. ', '')
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    # re.escape is required here: placed raw inside [...], the `\]` pair in
    # string.punctuation is read as an escaped ']', so backslashes would never
    # be removed.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(f'[{string.digits}]', '', text)
    # Drop single ASCII letters that stand alone between whitespace.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    return text

train_data['Summary'] = train_data['Summary'].apply(clean_text)


# "drama", "documentary", "comedy" and "short" have far more rows than any other
# genre, so every remaining genre is merged into a single "other" class.
#
# The labels are trimmed first and compared against the four genres that are
# kept, rather than against a hand-written list of space-padded names such as
# ' thriller '. A label that is spelled or padded differently can then no longer
# slip through unmerged. Missing labels are left missing.
major_genres = ['drama', 'documentary', 'comedy', 'short']
train_genre = train_data["Genre"].str.strip()
train_data["Genre"] = train_genre.where(
    train_genre.isin(major_genres) | train_genre.isna(), 'other'
)

train_data.head()

In [None]:
X_train = train_data["Summary"]
Y_train = train_data["Genre"]
print("Before vectorisation: ",X_train.shape)

# The encoder has to run in this cell: with these two lines disabled,
# `sentence_embeddings` does not exist on a fresh kernel and the statements
# below fail with a NameError.
model = SentenceTransformer('all-MiniLM-L6-v2')
sentence_embeddings = model.encode(list(X_train))

print("After vectorisation:",sentence_embeddings.shape)

# One row per summary, one column per embedding dimension.
x_train = pd.DataFrame(sentence_embeddings)
x_train.head()

In [None]:
# Row count per training label, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=Y_train, x=Y_train.values, palette='rocket', ax=ax)
sns.set(rc={'figure.figsize':(8,6)})
plt.xticks(rotation=45)
plt.show()

In [None]:
# Save the training features and labels to the same relative "../data"
# directory the notebook reads from, instead of an absolute path that exists
# only on one machine. index=False leaves the row index out of the files.
x_train.to_csv('../data/x_train.csv', index=False)
Y_train.to_csv('../data/y_train.csv', index=False)

# Testing data

In [None]:
# Read the labelled test file from the relative "../data" directory used by the
# other cells, rather than a machine-specific absolute path. The ":::" separator
# is longer than one character, which pandas handles as a regular expression and
# only with the Python parser, so that engine is requested explicitly.
test_data = pd.read_csv(
    '../data/test_data_solution.txt',
    sep=":::",
    names=["Id", "Title", "Genre", "Summary"],
    engine="python",
)
Y_test = test_data["Genre"]

# Clean the test summaries with the same function used for the training set.
X_test = test_data["Summary"].apply(clean_text)

In [None]:
# Number of test rows per genre label, drawn as a wide bar chart.
counts = Y_test.value_counts()

fig, axis = plt.subplots(figsize=(30, 10))
axis.bar(counts.index, counts.values)
axis.set_xlabel('Genre')
axis.set_ylabel('Number of movies')
plt.show()

In [None]:
# Merge every genre except "drama", "documentary", "comedy" and "short" into a
# single "other" class. Labels are trimmed first and compared against the four
# genres that are kept, rather than against a hand-written list of space-padded
# names, so a differently spelled or padded label cannot slip through unmerged.
# Missing labels are left missing.
major_genres = ['drama', 'documentary', 'comedy', 'short']
test_genre = Y_test.str.strip()
Y_test = test_genre.where(test_genre.isin(major_genres) | test_genre.isna(), 'other')

Y_test.head()

In [None]:
print("Before vectorisation: ",X_test.shape)

# Embed every cleaned test summary with the MiniLM sentence encoder.
model = SentenceTransformer('all-MiniLM-L6-v2')
test_summaries = list(X_test)
sentence_embeddings = model.encode(test_summaries)

print("After vectorisation:",sentence_embeddings.shape)

# Row count per test label, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=Y_test, x=Y_test.values, palette='rocket', ax=ax)
sns.set(rc={'figure.figsize':(8,6)})
plt.xticks(rotation=45)
plt.show()

In [None]:
# Turn the embedding matrix into a frame (one row per summary) and save it with
# the labels to the relative "../data" directory the notebook reads from,
# instead of an absolute path that exists only on one machine.
X_test = pd.DataFrame(sentence_embeddings)
X_test.to_csv('../data/x_test.csv', index=False)
Y_test.to_csv('../data/y_test.csv', index=False)