In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.metrics import confusion_matrix
from multiprocessing import Pool

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmetizer = WordNetLemmatizer()

stop_words = stopwords.words('english')
stopWords = set(stop_words)
punctuation = set(string.punctuation)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mskie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mskie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mskie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mskie\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
df = pd.read_csv('Genre_Classification_Dataset/train_data.txt', sep=':::', header = None)
df.columns = ["id", "title", "genre", "description"]
print(df.head())

  df = pd.read_csv('Genre_Classification_Dataset/train_data.txt', sep=':::', header = None)


   id                               title       genre  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                         description  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  


In [4]:
genre_counts = df['genre'].value_counts()
print(genre_counts)

genre
drama           13613
documentary     13096
comedy           7447
short            5073
horror           2204
thriller         1591
action           1315
western          1032
reality-tv        884
family            784
adventure         775
music             731
romance           672
sci-fi            647
adult             590
crime             505
animation         498
sport             432
talk-show         391
fantasy           323
mystery           319
musical           277
biography         265
history           243
game-show         194
news              181
war               132
Name: count, dtype: int64


In [5]:
def process_text(doc):
  token = nltk.word_tokenize(doc)
  token = [lemmetizer.lemmatize(word.lower()) for word in token if word not in punctuation and word.lower() not in stopWords]
  clean_text = ' '.join(token)
  return clean_text


df['clean_description]'] = df['description'].apply(process_text)

print(df.head())

   id                               title       genre  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                         description  \
0   Listening in to a conversation between his do...   
1   A brother and sister with a past incestuous r...   
2   As the bus empties the students for their fie...   
3   To help their unemployed father make ends mee...   
4   The film's title refers not only to the un-re...   

                                  clean_description]  
0  listening conversation doctor parent 10-year-o...  
1  brother sister past incestuous relationship cu...  
2  bus empty student field trip museum natural hi...  
3  help unemployed father make end meet edith twi...  
4  film 's title refers un-recovered body gro

In [6]:
vectorizer = TfidfVectorizer(max_features = 5000, ngram_range=(1,3))
X = vectorizer.fit_transform(df['clean_description]'])
y = df['genre']

In [7]:
df_test = pd.read_csv('Genre_Classification_Dataset/test_data_solution.txt', sep=':::', header=None)
df_test.columns = ['id', 'title', 'genre', 'description']

print(df_test.head())

df_test['clean_description'] = df_test['description'].apply(process_text)
print("\n after cleaning")
print(df_test.head())

x_test = vectorizer.fit_transform(df_test['clean_description'])
print("\n after vectorizing")
print(x_test)

  df_test = pd.read_csv('Genre_Classification_Dataset/test_data_solution.txt', sep=':::', header=None)


   id                          title          genre  \
0   1          Edgar's Lunch (1998)       thriller    
1   2      La guerra de papá (1977)         comedy    
2   3   Off the Beaten Track (2010)    documentary    
3   4        Meu Amigo Hindu (2015)          drama    
4   5             Er nu zhai (1955)          drama    

                                         description  
0   L.R. Brane loves his life - his car, his apar...  
1   Spain, March 1964: Quico is a very naughty ch...  
2   One year in the life of Albin and his family ...  
3   His father has died, he hasn't spoken with hi...  
4   Before he was known internationally as a mart...  

 after cleaning
   id                          title          genre  \
0   1          Edgar's Lunch (1998)       thriller    
1   2      La guerra de papá (1977)         comedy    
2   3   Off the Beaten Track (2010)    documentary    
3   4        Meu Amigo Hindu (2015)          drama    
4   5             Er nu zhai (1955)          dr

In [8]:
ytest  = df_test['genre']
print(ytest.head())

0        thriller 
1          comedy 
2     documentary 
3           drama 
4           drama 
Name: genre, dtype: object


In [9]:
model = LogisticRegression(C = 0.057, max_iter=1000)
model.fit(X, y)

In [10]:
y_ = model.predict(X)
print("train accuracy: ", accuracy_score(y, y_))
y_hat = model.predict(x_test)
print("test accuracy: ", accuracy_score(ytest, y_hat))
print("classification report: ", classification_report(ytest, y_hat))

train accuracy:  0.5045929095805511
test accuracy:  0.297619926199262


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


classification report:                 precision    recall  f1-score   support

      action        0.00      0.00      0.00      1314
       adult        0.00      0.00      0.00       590
   adventure        0.00      0.00      0.00       775
   animation        0.00      0.00      0.00       498
   biography        0.00      0.00      0.00       264
      comedy        0.24      0.05      0.08      7446
       crime        0.00      0.00      0.00       505
 documentary        0.29      0.76      0.42     13096
       drama        0.32      0.42      0.37     13612
      family        0.00      0.00      0.00       783
     fantasy        0.00      0.00      0.00       322
   game-show        0.00      0.00      0.00       193
     history        0.00      0.00      0.00       243
      horror        0.08      0.00      0.00      2204
       music        0.00      0.00      0.00       731
     musical        0.00      0.00      0.00       276
     mystery        0.00      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
conf_matrix = confusion_matrix(ytest, y_hat)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[   0    0    0    0    0   26    0  797  489    0    0    0    0    1
     0    0    0    0    0    0    0    1    0    0    0    0    0]
 [   0    0    0    0    0   23    0  304  258    0    0    0    0    0
     0    0    0    0    0    0    0    5    0    0    0    0    0]
 [   0    0    0    0    0   12    0  549  213    0    0    0    0    0
     0    0    0    0    0    0    0    1    0    0    0    0    0]
 [   0    0    0    0    0   10    0  359  126    0    0    0    0    0
     0    0    0    0    0    0    0    3    0    0    0    0    0]
 [   0    0    0    0    0    8    0  176   80    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0  347    0 4294 2793    0    0    0    0    5
     0    0    0    0    0    0    0    7    0    0    0    0    0]
 [   0    0    0    0    0   12    0  304  189    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0]
 