Importing Libraries


In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Importing dataset

In [None]:
# Both files delimit fields with ':::'; a multi-character separator requires the Python parser engine.
# NOTE(review): only three names are given for the training file, and the printed frame shows a
# 1-based index, so a leading ID field appears to be consumed as the index — confirm against the raw file.
train_data = pd.read_csv(
    '/content/train_data.txt',
    sep=':::',
    names=['Title', 'Genre', 'Description'],
    engine='python',
)
test_data = pd.read_csv(
    '/content/test_data.txt',
    sep=':::',
    names=['ID', 'Title', 'Description'],
    engine='python',
)

In [None]:
# Print the training frame as plain text (pandas abbreviates the middle rows with "...").
print(train_data)

                                              Title          Genre  \
1                     Oscar et la dame rose (2009)          drama    
2                                     Cupid (1997)       thriller    
3                 Young, Wild and Wonderful (1980)          adult    
4                            The Secret Sin (1915)          drama    
5                           The Unrecovered (2007)          drama    
...                                             ...            ...   
54210                              "Bonino" (1953)         comedy    
54211                  Dead Girls Don't Cry (????)         horror    
54212    Ronald Goedemondt: Ze bestaan echt (2008)    documentary    
54213                     Make Your Own Bed (1944)         comedy    
54214   Nature's Fury: Storm of the Century (2006)        history    

                                             Description  
1       Listening in to a conversation between his do...  
2       A brother and sister with a past 

In [None]:
# Print the test frame as plain text (pandas abbreviates the middle rows with "...").
print(test_data)

          ID                             Title  \
0          1             Edgar's Lunch (1998)    
1          2         La guerra de papá (1977)    
2          3      Off the Beaten Track (2010)    
3          4           Meu Amigo Hindu (2015)    
4          5                Er nu zhai (1955)    
...      ...                               ...   
54195  54196   "Tales of Light & Dark" (2013)    
54196  54197      Der letzte Mohikaner (1965)    
54197  54198              Oliver Twink (2007)    
54198  54199                Slipstream (1973)    
54199  54200        Curitiba Zero Grau (2010)    

                                             Description  
0       L.R. Brane loves his life - his car, his apar...  
1       Spain, March 1964: Quico is a very naughty ch...  
2       One year in the life of Albin and his family ...  
3       His father has died, he hasn't spoken with hi...  
4       Before he was known internationally as a mart...  
...                                          

In [None]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [None]:
# Preview the last rows of the training frame.
train_data.tail()

Unnamed: 0,Title,Genre,Description
54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...
54214,Nature's Fury: Storm of the Century (2006),history,"On Labor Day Weekend, 1935, the most intense ..."


In [None]:
# Preview the first rows of the test frame.
test_data.head()

Unnamed: 0,ID,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


In [None]:
# Preview the last rows of the test frame.
test_data.tail()

Unnamed: 0,ID,Title,Description
54195,54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."
54199,54200,Curitiba Zero Grau (2010),"Curitiba is a city in movement, with rhythms ..."


In [None]:
# Summary statistics for the training frame; its text columns are reported as count/unique/top/freq.
train_data.describe()

Unnamed: 0,Title,Genre,Description
count,54214,54214,54214
unique,54214,27,54086
top,Oscar et la dame rose (2009),drama,Grammy - music award of the American academy ...
freq,1,13613,12


In [None]:
# Summary statistics for the test frame; only the numeric ID column is summarised.
test_data.describe()

Unnamed: 0,ID
count,54200.0
mean,27100.5
std,15646.336632
min,1.0
25%,13550.75
50%,27100.5
75%,40650.25
max,54200.0


Checking for null values

In [None]:
# Count missing values per column in the training frame.
train_data.isnull().sum()

Title          0
Genre          0
Description    0
dtype: int64

In [None]:
# Count missing values per column in the test frame.
test_data.isnull().sum()

ID             0
Title          0
Description    0
dtype: int64

In [None]:
# Remove the Title column in place; only Genre and Description are kept for training.
# Re-running this cell raises KeyError because the column is already gone.
train_data.drop(columns=['Title'], inplace=True)

In [None]:
# Confirm that only Genre and Description remain in the training frame.
train_data.head()

Unnamed: 0,Genre,Description
1,drama,Listening in to a conversation between his do...
2,thriller,A brother and sister with a past incestuous r...
3,adult,As the bus empties the students for their fie...
4,drama,To help their unemployed father make ends mee...
5,drama,The film's title refers not only to the un-re...


In [None]:
# Remove the ID column in place; Title and Description are kept.
# Re-running this cell raises KeyError because the column is already gone.
test_data.drop(columns=['ID'], inplace=True)

In [None]:
# Confirm that only Title and Description remain in the test frame.
test_data.head()

Unnamed: 0,Title,Description
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,Er nu zhai (1955),Before he was known internationally as a mart...


In [None]:
# Show the index range, column dtypes, non-null counts and memory usage of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54214 entries, 1 to 54214
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Genre        54214 non-null  object
 1   Description  54214 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [None]:
# Show the index range, column dtypes, non-null counts and memory usage of the test frame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54200 entries, 0 to 54199
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        54200 non-null  object
 1   Description  54200 non-null  object
dtypes: object(2)
memory usage: 847.0+ KB


In [None]:
# (rows, columns) of the training frame after dropping Title.
train_data.shape

(54214, 2)

In [None]:
# Rows per genre, to see how the target classes are distributed.
train_data['Genre'].value_counts()

Genre
 drama           13613
 documentary     13096
 comedy           7447
 short            5073
 horror           2204
 thriller         1591
 action           1315
 western          1032
 reality-tv        884
 family            784
 adventure         775
 music             731
 romance           672
 sci-fi            647
 adult             590
 crime             505
 animation         498
 sport             432
 talk-show         391
 fantasy           323
 mystery           319
 musical           277
 biography         265
 history           243
 game-show         194
 news              181
 war               132
Name: count, dtype: int64

Text Cleaning

In [None]:
# Fetch the NLTK resources used below: the stop-word lists and the 'punkt' tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english')) # stop words


def clean_data(text):
    """Normalise a raw description into a space-separated string of lowercase words.

    Steps, in order:
      1. lowercase the text;
      2. remove URLs, e-mail addresses, @mentions, ``pic.`` links and
         ``[...]`` bracketed spans;
      3. replace every character other than letters, ``+`` and ``'`` by a space;
      4. drop isolated single letters and any remaining punctuation;
      5. collapse repeated whitespace and tokenize;
      6. drop English stop words and tokens of two characters or fewer.

    Step 2 must run before step 3: its patterns rely on punctuation
    (``.``, ``@``, ``[``, ``]``) that step 3 turns into spaces. Running them
    afterwards made the bracket rule dead code and let the unescaped
    ``pic.\\S+`` pattern delete ordinary words such as "picture" or "epic ...".

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).
    """
    text = text.lower()

    # --- structured tokens: removed while their punctuation is still present ---
    text = re.sub(r'http\S+', '', text)              # urls
    text = re.sub(r'[\w\.-]+@[\w\.-]+', '', text)    # emails (before mentions, so the local part does not survive)
    text = re.sub(r'@\S+', '', text)                 # mentions
    text = re.sub(r'pic\.\S+', '', text)             # pic.<host>/... links; the dot is literal
    text = re.sub(r'\[[^]]*\]', '', text)            # square brackets together with their content

    # --- character-level normalisation ---
    # Everything except letters, '+' and "'" becomes a space; this also covers
    # digits, '#', '_' and newlines, so no separate rules are needed for them.
    text = re.sub(r"[^a-zA-Z+']", ' ', text)
    # The padding space lets a single letter at the very end of the text match too.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text + ' ')
    # Strips the '+' and "'" characters that the filter above let through.
    text = "".join(char for char in text if char not in string.punctuation)
    text = re.sub(r"\s[\s]+", " ", text).strip()     # repeated/leading/trailing spaces

    # --- token-level filtering ---
    tokens = word_tokenize(text)
    text = " ".join(word for word in tokens if word not in stop_words and len(word) > 2)

    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Clean the Description column of both frames with the same function.
for frame in (train_data, test_data):
    frame['Description'] = frame['Description'].apply(clean_data)

Stemming

In [None]:
port_stem = PorterStemmer()
# Built once: the previous version evaluated stopwords.words('english') for every
# single token and scanned the returned list, which dominates runtime on ~100k descriptions.
_STEMMING_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(content):
  """Return `content` reduced to Porter stems, joined by single spaces.

  Non-letter characters are replaced by spaces, the text is lowercased and
  split on whitespace, English stop words are dropped, and every remaining
  word is stemmed with the module-level `port_stem`.

  Parameters
  ----------
  content : str
      Text to stem.

  Returns
  -------
  str
      Space-separated stems (empty string when nothing survives).
  """
  words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
  stems = [port_stem.stem(word) for word in words if word not in _STEMMING_STOP_WORDS]
  return ' '.join(stems)

In [None]:
# Stem the cleaned Description column of both frames.
for frame in (train_data, test_data):
    frame['Description'] = frame['Description'].apply(stemming)

Label Encoding

In [None]:
# Learn the genre -> integer mapping, then overwrite the Genre column with the integer codes.
# `le` is kept so the codes can be mapped back to genre names later.
le = LabelEncoder()
genre_labels = train_data['Genre'].values
le.fit(genre_labels)
train_data['Genre'] = le.transform(genre_labels)

In [None]:
# Inspect the frame after cleaning, stemming and label encoding (Genre now holds integer codes).
train_data.head()

Unnamed: 0,Genre,Description
1,8,listen convers doctor parent year old oscar le...
2,24,brother sister past incestu relationship curre...
3,1,bu empti student field trip museum natur histo...
4,8,help unemploy father make end meet edith twin ...
5,8,film titl refer recov bodi ground zero also st...


In [None]:
#Separating data as text and labels
X = train_data['Description']
Y = train_data['Genre']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=2)

In [None]:
# Shapes of the full feature series and of its train / hold-out parts.
print(*(part.shape for part in (X, X_train, X_test)))

(54214,) (43371,) (10843,)


Feature Extraction

In [None]:
# TF-IDF vectoriser limited to the 100000 highest-frequency terms, with scikit-learn's
# built-in English stop-word list applied on top of the earlier cleaning.
feature_extraction = TfidfVectorizer(stop_words='english', max_features=100000)
# The original cell called an undefined name `vectorize`, which raises NameError on a
# fresh kernel. The alias keeps any other cell that refers to `vectorize` working.
vectorize = feature_extraction

# Learn the vocabulary and IDF weights from the training split only, then reuse
# them unchanged for the held-out split.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

#Convert Y_train and Y_test data as integers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [None]:
# Print the training feature matrix; the text form lists (row, column) positions with their weights.
print(X_train_features)

  (0, 38681)	0.2190274086172623
  (0, 15439)	0.1771258830382819
  (0, 60194)	0.20020769771075522
  (0, 49925)	0.14053191641227894
  (0, 15077)	0.14773449702734814
  (0, 43740)	0.12529511373010171
  (0, 18123)	0.20941125415850448
  (0, 4849)	0.1418262600740845
  (0, 49774)	0.1439822075223621
  (0, 15529)	0.14589943586391946
  (0, 45088)	0.22508184949433854
  (0, 28003)	0.28051932543701247
  (0, 10880)	0.1349335596457277
  (0, 65378)	0.18055822421206988
  (0, 30513)	0.16363168102104744
  (0, 22281)	0.08561418330966841
  (0, 21726)	0.14056613270849402
  (0, 73634)	0.21692658115129826
  (0, 26801)	0.23280082769504687
  (0, 23193)	0.07601322057640841
  (0, 485)	0.1305476316073142
  (0, 23953)	0.2100855595305037
  (0, 64846)	0.2526654617705323
  (0, 5682)	0.11725996504972172
  (0, 44522)	0.4412423451346086
  :	:
  (43370, 13821)	0.10910275951058011
  (43370, 51697)	0.2919462579804407
  (43370, 61218)	0.08877238550147964
  (43370, 71949)	0.0909745381079262
  (43370, 53297)	0.08896393827435946

In [None]:
# Print the held-out feature matrix; the text form lists (row, column) positions with their weights.
print(X_test_features)

  (0, 76816)	0.07881545635281943
  (0, 72041)	0.22173218109972276
  (0, 70431)	0.16034963463555946
  (0, 69631)	0.16173302836372866
  (0, 68937)	0.1746324367944506
  (0, 64646)	0.15560893009429186
  (0, 61742)	0.2167042752182481
  (0, 53082)	0.2276419422927205
  (0, 52165)	0.2167042752182481
  (0, 49479)	0.09592122928654798
  (0, 48767)	0.19277263726775998
  (0, 48073)	0.1681520520182075
  (0, 44313)	0.14248565441862482
  (0, 39470)	0.07309913230554893
  (0, 39186)	0.17011008040784165
  (0, 30632)	0.19151994565224417
  (0, 24255)	0.1687189626348375
  (0, 23548)	0.19002445206851798
  (0, 23214)	0.13809356312746443
  (0, 23193)	0.1603680022896094
  (0, 22281)	0.09031176051294121
  (0, 21772)	0.18710809903274928
  (0, 21443)	0.16936985351810402
  (0, 16270)	0.11319995574994131
  (0, 14734)	0.13175141750874417
  :	:
  (10841, 6792)	0.22998265870173862
  (10841, 2645)	0.21671693248806717
  (10842, 77034)	0.2852103853002602
  (10842, 70821)	0.1694508242758395
  (10842, 69472)	0.1047026066802

Train the model - Logistic Regression

In [None]:
# Logistic regression classifier created with scikit-learn's default settings.
# NOTE(review): no solver or iteration options are set — confirm the fit converges on this feature matrix.
model = LogisticRegression()

In [None]:
# Fit the classifier on the TF-IDF training features and the integer genre labels.
model.fit(X_train_features, Y_train)

Evaluating the Trained Model

In [None]:
#prediction on training data
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [None]:
# Report the fraction of training rows whose genre was predicted correctly.
print("Accuracy on training data : ", accuracy_on_training_data)

Accuracy on training data :  0.7051947153628


In [None]:
#prediction on testing data
prediction_on_testing_data = model.predict(X_test_features)
accuracy_on_testing_data = accuracy_score(Y_test, prediction_on_testing_data)

In [None]:
# Report the fraction of held-out rows whose genre was predicted correctly.
print('Accuracy on test data: ', accuracy_on_testing_data)

Accuracy on test data:  0.5769620953610625
