In [1]:
import numpy as pd
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
warnings.filterwarnings('ignore')

    # Reading the Data
    Read data from the file and split it into rows
    Split each row into columns
    Create a Pandas DataFrame


In [2]:
with open("./train_data.txt", 'r') as file:
    text_data = file.read().strip().split('\n')

data = [row.split(' ::: ') for row in text_data]

dataset = pd.DataFrame(data, columns=["ID", "TITLE", "GENRE", "DESCRIPTION"])

In [3]:
dataset.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [4]:
dataset.columns

Index(['ID', 'TITLE', 'GENRE', 'DESCRIPTION'], dtype='object')

In [5]:
dataset.isna().sum()

ID             0
TITLE          0
GENRE          0
DESCRIPTION    0
dtype: int64

In [6]:
# Use regular expressions to extract the year from the TITLE column
#dataset['YEAR'] = dataset['TITLE'].str.extract(r'\((\d{4})\)')

# Convert the YEAR column to integers
#dataset['YEAR'] = dataset['YEAR'].astype(int)

# Use regular expressions to remove the year from the TITLE column
dataset['TITLE'] = dataset['TITLE'].str.replace(r'\(\d{4}\)', '').str.strip()


In [7]:
dataset.isna().sum()

ID             0
TITLE          0
GENRE          0
DESCRIPTION    0
dtype: int64

In [8]:
dataset['TITLE'] = dataset['TITLE'].str.replace(r'\(\d{4}\)', '').str.strip()

In [9]:
#dataset['TITLE']

In [10]:
# Remove parentheses and their contents using regular expressions
dataset['TITLE'] = dataset['TITLE'].str.replace(r'\([^)]*\)', '')

In [11]:
dataset.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose,drama,Listening in to a conversation between his doc...
1,2,Cupid,thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin,drama,To help their unemployed father make ends meet...
4,5,The Unrecovered,drama,The film's title refers not only to the un-rec...


Convert the Genre type
___

In [12]:
import category_encoders as ce 

encoder = ce.OrdinalEncoder(cols=['GENRE'])
dataset = encoder.fit_transform(dataset)

In [13]:
dataset['GENRE'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27])

# Cleaning the Raw Data using NTk(Natural Language Toolkit)

Text preprocessing is a crucial step in natural language processing (NLP) and machine learning tasks. 

It involves cleaning and transforming raw text data into a format that can be effectively used for training machine learning models. 

Here are common text preprocessing steps in NLP:

    1.Lowercasing
    2.Tokenization using NLTK
    3.Removing Punctuation
    4.Stop Word Removal using NLTK
    5.Stemming using NLTK
    6.Handling Contractions and Abbreviations
    7.Removing Numbers and Special Characters
    8.Token Filtering
    9.Text Vectorization using Count Vectorization and TF-IDF Vectorization

1. Lowercasing
___

In [14]:
dataset['TITLE'] = dataset['TITLE'].str.lower()
dataset['DESCRIPTION'] = dataset['DESCRIPTION'].str.lower()

In [15]:
dataset.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,oscar et la dame rose,1,listening in to a conversation between his doc...
1,2,cupid,2,a brother and sister with a past incestuous re...
2,3,"young, wild and wonderful",3,as the bus empties the students for their fiel...
3,4,the secret sin,1,to help their unemployed father make ends meet...
4,5,the unrecovered,1,the film's title refers not only to the un-rec...


2. Tokenization using ntlk
___

In [16]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [17]:
from nltk.tokenize import word_tokenize

def tokenize_text(text) :
    return word_tokenize(text)

dataset['TITLE'] = dataset['TITLE'].apply(tokenize_text)
dataset['DESCRIPTION'] = dataset['DESCRIPTION'].apply(tokenize_text)

In [18]:
dataset.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,"[oscar, et, la, dame, rose]",1,"[listening, in, to, a, conversation, between, ..."
1,2,[cupid],2,"[a, brother, and, sister, with, a, past, inces..."
2,3,"[young, ,, wild, and, wonderful]",3,"[as, the, bus, empties, the, students, for, th..."
3,4,"[the, secret, sin]",1,"[to, help, their, unemployed, father, make, en..."
4,5,"[the, unrecovered]",1,"[the, film, 's, title, refers, not, only, to, ..."


3. Removing Noise:
{removing irrelevant characters from the text}
___

In [19]:
def remove_numbers(tokens):
    return [ token for token in tokens if not token.isdigit()]

dataset['TITLE'] = dataset['TITLE'].apply(remove_numbers)
dataset['DESCRIPTION'] = dataset['DESCRIPTION'].apply(remove_numbers)

In [20]:
import string

def remove_punctuations(tokens):
    return [ token for token in tokens if not token in string.punctuation ]


dataset['TITLE'] = dataset['TITLE'].apply(remove_punctuations)
dataset['DESCRIPTION'] = dataset['DESCRIPTION'].apply(remove_punctuations)

In [21]:
from nltk.corpus import stopwords

def remove_stopwords(tokens):
    stopwords_list = stopwords.words('english')
    return [ token for token in tokens if token not in stopwords_list]

dataset['TITLE'] = dataset['TITLE'].apply(remove_stopwords)
dataset['DESCRIPTION'] = dataset['DESCRIPTION'].apply(remove_stopwords)

In [22]:
dataset.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,"[oscar, et, la, dame, rose]",1,"[listening, conversation, doctor, parents, 10-..."
1,2,[cupid],2,"[brother, sister, past, incestuous, relationsh..."
2,3,"[young, wild, wonderful]",3,"[bus, empties, students, field, trip, museum, ..."
3,4,"[secret, sin]",1,"[help, unemployed, father, make, ends, meet, e..."
4,5,[unrecovered],1,"[film, 's, title, refers, un-recovered, bodies..."


4. Stemming using NLTK
___


In [23]:
from nltk import SnowballStemmer

lang = "english"

stemmer = SnowballStemmer(lang)
def adding_Stemming(tokens):
    return [ stemmer.stem(token) for token in tokens ]

dataset['TITLE'] = dataset['TITLE'].apply(adding_Stemming)
dataset['DESCRIPTION'] = dataset['DESCRIPTION'].apply(adding_Stemming)


In [24]:
dataset.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,"[oscar, et, la, dame, rose]",1,"[listen, convers, doctor, parent, 10-year-old,..."
1,2,[cupid],2,"[brother, sister, past, incestu, relationship,..."
2,3,"[young, wild, wonder]",3,"[bus, empti, student, field, trip, museum, nat..."
3,4,"[secret, sin]",1,"[help, unemploy, father, make, end, meet, edit..."
4,5,[unrecov],1,"[film, 's, titl, refer, un-recov, bodi, ground..."


# Text Vectorization using Count Vectorization and TF-IDF Vectorization

In [25]:
d2set = dataset
d2set.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,"[oscar, et, la, dame, rose]",1,"[listen, convers, doctor, parent, 10-year-old,..."
1,2,[cupid],2,"[brother, sister, past, incestu, relationship,..."
2,3,"[young, wild, wonder]",3,"[bus, empti, student, field, trip, museum, nat..."
3,4,"[secret, sin]",1,"[help, unemploy, father, make, end, meet, edit..."
4,5,[unrecov],1,"[film, 's, titl, refer, un-recov, bodi, ground..."


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)

tfidf_matrix = tfidf_vectorizer.fit_transform([" ".join(doc) for doc in d2set['TITLE'] + d2set['DESCRIPTION']])

tfidf_d2set = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

#d2set = pd.concat([d2set, tfidf_d2set], axis=1)


In [27]:
tfidf_d2set.head()

Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,youth,youtub,yu,zach,zealand,zen,zero,zombi,zone,zoo
0,0.0,0.141476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.168875,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1846,0.0,0.0,0.0


In [28]:
d2set.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,"[oscar, et, la, dame, rose]",1,"[listen, convers, doctor, parent, 10-year-old,..."
1,2,[cupid],2,"[brother, sister, past, incestu, relationship,..."
2,3,"[young, wild, wonder]",3,"[bus, empti, student, field, trip, museum, nat..."
3,4,"[secret, sin]",1,"[help, unemploy, father, make, end, meet, edit..."
4,5,[unrecov],1,"[film, 's, titl, refer, un-recov, bodi, ground..."


Spliting the date to Training and Testing
___

In [29]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(tfidf_d2set,d2set['GENRE'],test_size=0.2,random_state=42)

# Model Buildng For validation Set

Logistic Regression
___

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report

tf_log = LogisticRegression()
tf_log.fit(x_train,y_train)

In [31]:
pred_log_valid = tf_log.predict(x_test)
print("Logistic Regression on Validation set")
print("--------------------------------------------------------------")
print("Accuracy Score : ",accuracy_score(pred_log_valid,y_test)) 
print("Classification Report : \n",classification_report(pred_log_valid,y_test))

Logistic Regression on Validation set
--------------------------------------------------------------
Accuracy Score :  0.584616803467675
Classification Report : 
               precision    recall  f1-score   support

           1       0.77      0.54      0.64      3813
           2       0.18      0.43      0.25       127
           3       0.29      0.75      0.42        44
           4       0.84      0.67      0.74      3322
           5       0.60      0.53      0.56      1622
           6       0.04      0.31      0.07        13
           7       0.18      0.49      0.26        70
           8       0.61      0.66      0.64       397
           9       0.30      0.61      0.40        46
          10       0.13      0.61      0.22        23
          11       0.29      0.53      0.38       145
          12       0.00      0.00      0.00         1
          13       0.33      0.47      0.39       721
          14       0.26      0.59      0.36        63
          15       0.50   

Random Forest Classifier
___

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report

tf_rfc = RandomForestClassifier(max_depth=15,verbose=0,random_state=42,n_estimators=20)
tf_rfc.fit(x_train,y_train)

In [33]:
pred_rfc_valid = tf_rfc.predict(x_test)
print("Random Forest Classifier on Validation set")
print("--------------------------------------------------------------")
print("Accuracy Score : ",accuracy_score(pred_rfc_valid,y_test))
print("Calssification Report : \n",classification_report(pred_rfc_valid,y_test))

Random Forest Classifier on Validation set
--------------------------------------------------------------
Accuracy Score :  0.44249746380153093
Calssification Report : 
               precision    recall  f1-score   support

           1       0.84      0.38      0.52      5950
           2       0.00      1.00      0.01         1
           3       0.01      1.00      0.02         1
           4       0.87      0.50      0.64      4580
           5       0.14      0.73      0.23       270
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.01      1.00      0.01         3
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       

Test dataset
---

In [34]:
with open("./test_data.txt", 'r') as file:
    text_data = file.read().strip().split('\n')

data = [row.split(' ::: ') for row in text_data]

test_data = pd.DataFrame(data, columns=["ID", "TITLE", "DESCRIPTION"])

In [35]:
test_data.head()

Unnamed: 0,ID,TITLE,DESCRIPTION
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


In [36]:
test_data.columns

Index(['ID', 'TITLE', 'DESCRIPTION'], dtype='object')

In [37]:
test_data.isna().sum()

ID             0
TITLE          0
DESCRIPTION    0
dtype: int64

In [38]:
test_data['TITLE'] = test_data['TITLE'].str.replace(r'\(\d{4}\)', '').str.strip()


In [39]:
test_data['TITLE'] = test_data['TITLE'].str.replace(r'\(\d{4}\)', '').str.strip()

In [40]:
# Remove parentheses and their contents using regular expressions
test_data['TITLE'] = test_data['TITLE'].str.replace(r'\([^)]*\)', '')

In [41]:
test_data.head()

Unnamed: 0,ID,TITLE,DESCRIPTION
0,1,Edgar's Lunch,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai,Before he was known internationally as a marti...


Cleaning the Raw Data using NTk(Natural Language Toolkit)
___


In [42]:
test_data['TITLE'] = test_data['TITLE'].str.lower()
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].str.lower()

In [43]:
test_data['TITLE'] = test_data['TITLE'].apply(tokenize_text)
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].apply(tokenize_text)

In [44]:
test_data['TITLE'] = test_data['TITLE'].apply(remove_numbers)
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].apply(remove_numbers)

test_data['TITLE'] = test_data['TITLE'].apply(remove_punctuations)
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].apply(remove_punctuations)

test_data['TITLE'] = test_data['TITLE'].apply(remove_stopwords)
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].apply(remove_stopwords)

In [45]:
test_data['TITLE'] = test_data['TITLE'].apply(adding_Stemming)
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].apply(adding_Stemming)

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)

tfidf_matrix = tfidf_vectorizer.fit_transform([" ".join(doc) for doc in test_data['TITLE'] + test_data['DESCRIPTION']])

tfidf_test = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

#d2set = pd.concat([d2set, tfidf_d2set], axis=1)


In [47]:
tfidf_test = tfidf_test[list(set(tfidf_test.columns).intersection(set(tfidf_d2set.columns)))]

tfidf_test.head()

Unnamed: 0,coupl,mega,greatest,imposs,downtown,shane,flashback,jew,arrog,karl,...,lov,pop,campbel,scienc,artifici,han,creation,form,connect,vancouv
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.079915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
tfidf_d2set.head()

Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,youth,youtub,yu,zach,zealand,zen,zero,zombi,zone,zoo
0,0.0,0.141476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.168875,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1846,0.0,0.0,0.0


In [49]:
#add colums to tfidf_test where those columns are not present in tfidf_test and present in tfidf_d2set
for col in list(set(tfidf_d2set.columns).difference(set(tfidf_test.columns))):
    tfidf_test[col] = 0

tfidf_test.head()

Unnamed: 0,coupl,mega,greatest,imposs,downtown,shane,flashback,jew,arrog,karl,...,3rd,awesom,christi,elsewher,cultiv,deadlin,shirley,tycoon,comeback,afflict
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.079915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
for col in tfidf_test.columns :
    if col not in tfidf_d2set.columns :
        print(col)

In [51]:
with open("./test_data_solution.txt", 'r') as file:
    text_data = file.read().strip().split('\n')

data = [row.split(' ::: ') for row in text_data]

validation = pd.DataFrame(data, columns=["ID", "TITLE", "GENRE", "DESCRIPTION"])

In [52]:
validation.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a marti...


In [53]:
validation = encoder.fit_transform(validation)

In [54]:
validation.isna().sum()

ID             0
TITLE          0
GENRE          0
DESCRIPTION    0
dtype: int64

In [55]:
validation.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),1,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),2,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),3,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),4,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),4,Before he was known internationally as a marti...


In [56]:
validation.shape

(54200, 4)

In [57]:
tfidf_test.shape

(54200, 5000)

In [58]:
sorted_tfidf_test = tfidf_test[tfidf_d2set.columns]

In [59]:
sorted_tfidf_test.head()

Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,youth,youtub,yu,zach,zealand,zen,zero,zombi,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.133558,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0


In [60]:
tfidf_d2set.head()

Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,youth,youtub,yu,zach,zealand,zen,zero,zombi,zone,zoo
0,0.0,0.141476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.168875,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1846,0.0,0.0,0.0


# Testing the Model on Test Data

Logistic Regression
___

In [61]:
pred_log = tf_log.predict(sorted_tfidf_test)
print("Logistic Regression on Validation set")
print("--------------------------------------------------------------")
print("Accuracy Score : ",accuracy_score(pred_log,validation['GENRE'])) 
print("Classification Report : \n",classification_report(pred_log,validation['GENRE']))

Logistic Regression on Validation set
--------------------------------------------------------------
Accuracy Score :  0.04549815498154981
Classification Report : 
               precision    recall  f1-score   support

           1       0.54      0.04      0.08     19273
           2       0.00      0.04      0.00       556
           3       0.00      0.04      0.00       223
           4       0.09      0.07      0.08     16732
           5       0.06      0.02      0.03      8203
           6       0.00      0.04      0.00        50
           7       0.00      0.00      0.00       279
           8       0.01      0.00      0.01      1971
           9       0.29      0.67      0.40       187
          10       0.00      0.00      0.00        83
          11       0.07      0.01      0.02       780
          12       0.00      0.00      0.00        21
          13       0.03      0.00      0.00      3448
          14       0.00      0.00      0.00       281
          15       0.02 

Random Forest Classifer
___

In [62]:
pred_rfc = tf_rfc.predict(sorted_tfidf_test)
print("Logistic Regression on Validation set")
print("--------------------------------------------------------------")
print("Accuracy Score : ",accuracy_score(pred_rfc,validation['GENRE'])) 
print("Classification Report : \n",classification_report(pred_rfc,validation['GENRE']))

Logistic Regression on Validation set
--------------------------------------------------------------
Accuracy Score :  0.06715867158671587
Classification Report : 
               precision    recall  f1-score   support

           1       0.84      0.04      0.08     29982
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.17      0.10      0.13     22699
           5       0.01      0.01      0.01      1277
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00        22
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         0
          15       0.00 