## Imports and loading data

In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from matplotlib import pyplot as plt
import seaborn as sns
#import eli5
import csv
import numpy as np

In [31]:
# Arabic genre name as stored in the CSV -> English class label.
# Only rows whose genre is one of these four keys are kept.
GENRE_LABELS = {
    'دراما': 'drama',
    'رومانسي': 'romance',
    'كوميدي': 'comedy',
    'ﺗﺸﻮﻳﻖ ﻭﺇﺛﺎﺭﺓ': 'action',
}

movies_raw = pd.read_csv('MoviesWithSynopsisAndCategory.csv')
is_kept_genre = movies_raw['تصنيف الفيلم'].isin(list(GENRE_LABELS))
movies = (
    # Keep the four genres above and only the synopsis and genre columns.
    movies_raw.loc[is_kept_genre, ["ملخص", 'تصنيف الفيلم']]
    .reset_index(drop=True)
    .rename(columns={"ملخص": "text", 'تصنيف الفيلم': "label"})
    # Build the translated column by assignment instead of a chained
    # `movies["label"].replace(..., inplace=True)`: under pandas Copy-on-Write
    # that call edits a temporary Series and leaves `movies` unchanged.
    .assign(label=lambda frame: frame["label"].map(GENRE_LABELS))
)


In [32]:
# Count the non-null synopses per genre label to check the class balance.
movies.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
action,105
comedy,126
drama,556
romance,246


In [33]:
# Preview the first two rows of the cleaned frame.
movies.head(2)

Unnamed: 0,text,label
0,دار حدث فيلم خيري فتاه ثري مخطوب ابناء عموم تف...,romance
1,سافر سطا عبدالرحمن صاحب ورشه ميكانزم زميل رحله...,drama


## Old Classifier


In [34]:
from sklearn.model_selection import train_test_split

# Split the rows into a train and a test frame, holding out 20% for testing;
# random_state is fixed so the split is the same on every run.
train, test = train_test_split(movies, test_size=0.2,random_state=101)

In [35]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)), keeping at most 150000 of them.
text_transformer = TfidfVectorizer( ngram_range=(1, 2),  max_features=150000)

In [36]:
%%time
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
X_train_text = text_transformer.fit_transform(train['text'])
X_test_text = text_transformer.transform(test['text'])

CPU times: user 275 ms, sys: 3.46 ms, total: 278 ms
Wall time: 281 ms


In [37]:
# Shapes of the train and test TF-IDF matrices.
X_train_text.shape,X_test_text.shape

((826, 42817), (207, 42817))

In [39]:
# Logistic regression with weak regularisation (C=50) and a fixed seed.
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5, and with the lbfgs solver the default already fits a
# multinomial model when the target has more than two classes.
logit = LogisticRegression(C=5e1, solver='lbfgs', random_state=17, n_jobs=4)

In [40]:
# Two stratified, shuffled folds with a fixed seed, used for cross-validation below.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=17)


In [41]:
%%time
# Cross-validate on the training split only, scoring each fold with micro-averaged F1.
cv_results = cross_val_score(logit, X_train_text, train['label'], cv=skf, scoring='f1_micro')

CPU times: user 99 ms, sys: 54.9 ms, total: 154 ms
Wall time: 5.75 s


In [42]:
# Per-fold scores followed by their mean.
cv_results, cv_results.mean()


(array([0.52058111, 0.53753027]), 0.5290556900726392)

In [43]:
%%time
# Fit the model on the whole training split before predicting on the test split.
logit.fit(X_train_text, train['label'])

CPU times: user 36 ms, sys: 5.89 ms, total: 41.9 ms
Wall time: 3.15 s


LogisticRegression(C=50.0, multi_class='multinomial', n_jobs=4, random_state=17)

In [44]:
# %%time
# eli5.show_weights(estimator=logit, 
#                   feature_names= list(text_transformer.get_feature_names()),
#                  top=(50, 5))

In [45]:
# Predict a genre for every held-out synopsis and show the predictions
# as a one-column table.
test_preds = logit.predict(X_test_text)
pd.DataFrame(data=test_preds, columns=['تصنيف'])


Unnamed: 0,تصنيف
0,drama
1,drama
2,romance
3,drama
4,drama
...,...
202,romance
203,drama
204,drama
205,romance


In [46]:
# Hand-computed accuracy: the share of test rows whose predicted label equals
# the true label. `a` (predictions) and `b` (true labels) stay as plain lists
# because the next cell reuses them.
a = list(test_preds)
b = list(test['label'])
count = sum(1 for predicted, actual in zip(a, b) if predicted == actual)
print("Accuracy Precentage =" + str(round((count / len(test)) * 100, 3)))

Accuracy Precentage =61.836


In [47]:
from sklearn import metrics

# Confusion matrix, with the true labels (b) passed first and the predictions (a) second.
confusion = metrics.confusion_matrix(b, a)
print(confusion)

# Per-class precision, recall and F1, plus the overall averages.
report = metrics.classification_report(b, a)
print(report)

[[  1   0  19   0]
 [  1   2  18   6]
 [  0   1 110   8]
 [  0   1  25  15]]
              precision    recall  f1-score   support

      action       0.50      0.05      0.09        20
      comedy       0.50      0.07      0.13        27
       drama       0.64      0.92      0.76       119
     romance       0.52      0.37      0.43        41

    accuracy                           0.62       207
   macro avg       0.54      0.35      0.35       207
weighted avg       0.58      0.62      0.55       207



## Various Classifiers

In [48]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

%matplotlib inline

In [49]:
# Helper functions 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
def train_model(model, data, targets):
    """Fit a text-classification pipeline that ends in ``model``.

    The pipeline has three steps: ``CountVectorizer`` ('vect'),
    ``TfidfTransformer`` ('tfidf') and the supplied estimator ('clf').

    Parameters
    ----------
    model : estimator used as the final 'clf' step.
    data : raw input passed to the pipeline's ``fit`` (the notebook passes
        a column of texts).
    targets : labels passed to the pipeline's ``fit``.

    Returns
    -------
    The fitted ``Pipeline`` object.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(data, targets)
    return pipeline
def get_accuracy(trained_model,X, y):
    """Score a fitted model on ``X``/``y`` and print diagnostic reports.

    Besides returning the accuracy, this prints the confusion matrix and
    the classification report for the predictions.

    Parameters
    ----------
    trained_model : object exposing ``predict(X)``.
    X : inputs handed to ``trained_model.predict``.
    y : true labels, compared element-wise with the predictions.

    Returns
    -------
    The mean of ``predicted == y``, i.e. the fraction of matching labels.
    """
    from sklearn import metrics

    y_pred = trained_model.predict(X)
    # Compute the score before printing, matching the original evaluation order.
    score = np.mean(y_pred == y)

    for report in (metrics.confusion_matrix, metrics.classification_report):
        print(report(y, y_pred))

    return score

In [55]:
from sklearn.tree import DecisionTreeClassifier

# Train the pipeline with a decision tree on the training texts, then score it
# on the held-out texts.
trained_clf_decision_tree = train_model(
    model=DecisionTreeClassifier(),
    data=train['text'],
    targets=train['label'],
)
accuracy = get_accuracy(trained_clf_decision_tree, X=test['text'], y=test['label'])
print(f"Test dataset accuracy with DecisionTreeClassifier: {accuracy:.2f}")

[[ 2  1 15  2]
 [ 1  4 19  3]
 [14 11 70 24]
 [ 2  3 17 19]]
              precision    recall  f1-score   support

      action       0.11      0.10      0.10        20
      comedy       0.21      0.15      0.17        27
       drama       0.58      0.59      0.58       119
     romance       0.40      0.46      0.43        41

    accuracy                           0.46       207
   macro avg       0.32      0.32      0.32       207
weighted avg       0.45      0.46      0.45       207

Test dataset accuracy with DecisionTreeClassifier: 0.46


In [56]:
from sklearn.naive_bayes import MultinomialNB

# Train the pipeline with multinomial naive Bayes on the training texts, then
# score it on the held-out texts.
trained_clf_multinomial_nb = train_model(
    model=MultinomialNB(),
    data=train['text'],
    targets=train['label'],
)
accuracy = get_accuracy(trained_clf_multinomial_nb, X=test['text'], y=test['label'])
print(f"Test dataset accuracy with MultinomialNB: {accuracy:.2f}")

[[  0   0  20   0]
 [  0   0  27   0]
 [  0   0 119   0]
 [  0   0  41   0]]
              precision    recall  f1-score   support

      action       0.00      0.00      0.00        20
      comedy       0.00      0.00      0.00        27
       drama       0.57      1.00      0.73       119
     romance       0.00      0.00      0.00        41

    accuracy                           0.57       207
   macro avg       0.14      0.25      0.18       207
weighted avg       0.33      0.57      0.42       207

Test dataset accuracy with MultinomialNB: 0.57


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
from sklearn.svm import LinearSVC

# Train the pipeline with a linear SVM on the training texts, then score it on
# the held-out texts.
trained_clf_linearSVC = train_model(
    model=LinearSVC(),
    data=train['text'],
    targets=train['label'],
)
accuracy = get_accuracy(trained_clf_linearSVC, X=test['text'], y=test['label'])
print(f"Test dataset accuracy with LinearSVC: {accuracy:.2f}")

[[  1   0  17   2]
 [  0   5  12  10]
 [  0   3 108   8]
 [  0   1  24  16]]
              precision    recall  f1-score   support

      action       1.00      0.05      0.10        20
      comedy       0.56      0.19      0.28        27
       drama       0.67      0.91      0.77       119
     romance       0.44      0.39      0.42        41

    accuracy                           0.63       207
   macro avg       0.67      0.38      0.39       207
weighted avg       0.64      0.63      0.57       207

Test dataset accuracy with LinearSVC: 0.63


In [58]:
from sklearn.ensemble import RandomForestClassifier

# Train the pipeline with a random forest on the training texts, then score it
# on the held-out texts.
trained_clf_random_forest = train_model(
    model=RandomForestClassifier(),
    data=train['text'],
    targets=train['label'],
)
accuracy = get_accuracy(trained_clf_random_forest, X=test['text'], y=test['label'])
print(f"Test dataset accuracy with RandomForestClassifier: {accuracy:.2f}")

[[  1   0  19   0]
 [  0   0  24   3]
 [  0   0 118   1]
 [  0   1  37   3]]
              precision    recall  f1-score   support

      action       1.00      0.05      0.10        20
      comedy       0.00      0.00      0.00        27
       drama       0.60      0.99      0.74       119
     romance       0.43      0.07      0.12        41

    accuracy                           0.59       207
   macro avg       0.51      0.28      0.24       207
weighted avg       0.52      0.59      0.46       207

Test dataset accuracy with RandomForestClassifier: 0.59


In [59]:
from sklearn.linear_model import LogisticRegression

# Train the pipeline with a default-configured logistic regression on the
# training texts, then score it on the held-out texts.
trained_clf_LogisticRegression = train_model(
    model=LogisticRegression(),
    data=train['text'],
    targets=train['label'],
)
accuracy = get_accuracy(trained_clf_LogisticRegression, X=test['text'], y=test['label'])
print(f"Test dataset accuracy with LogisticRegression: {accuracy:.2f}")

[[  0   0  20   0]
 [  0   0  24   3]
 [  0   0 118   1]
 [  0   0  34   7]]
              precision    recall  f1-score   support

      action       0.00      0.00      0.00        20
      comedy       0.00      0.00      0.00        27
       drama       0.60      0.99      0.75       119
     romance       0.64      0.17      0.27        41

    accuracy                           0.60       207
   macro avg       0.31      0.29      0.25       207
weighted avg       0.47      0.60      0.48       207

Test dataset accuracy with LogisticRegression: 0.60


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
