In [None]:
import numpy as np
import pandas as pd 

# model use for prediction
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# for split the dataset and test set
from sklearn.model_selection import train_test_split

# for turning plot from text to frequency matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# for choosing the features (words)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# for stemming and lemmatizing the texts
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer



In [None]:
# The original dataset was split offline into two CSV files:
#   * data.csv   - only the plot text (every other attribute was dropped)
#   * labels.csv - the genre labels, converted from strings to indicator
#                  columns with pandas.get_dummies

# Load the plot texts (the column named '0') and the label table.
plots_frame = pd.read_csv(
    '../input/movie-plot-title-data/data.csv',
    index_col='Unnamed: 0',
)
X = plots_frame['0']
y = pd.read_csv(
    '../input/movie-plot-title-data/labels.csv',
    index_col='Unnamed: 0',
)

# Genre names (this list has 18 entries).
genre_list = [
    'action', 'adventure', 'animated', 'biopic', 'comedy', 'crime',
    'drama', 'family', 'fantasy', 'film-noir', 'horror', 'musical',
    'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western',
]

In [None]:
# Print one example plot from X.
# NOTE(review): X[0] looks the entry up by index label when the Series has an
# integer index - presumably label 0 is the first row; confirm against data.csv.
print(X[0])

In [None]:
# Bar chart of the per-column sums of y, i.e. how many plots carry each genre label.
fig, ax = plt.subplots(figsize=(20, 12))
genre_counts = y.sum(axis=0)
ax.bar(y.columns, genre_counts)

In [None]:
# Print the number of plots in X (26259 rows according to the original author's note).
print(len(X))

In [None]:

# NLTK Snowball stemmer and WordNet lemmatizer applied to every plot.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()


def _normalize_plot(text):
    """Stem, then lemmatize, each whitespace-separated word of one plot."""
    return ' '.join(
        lemmatizer.lemmatize(stemmer.stem(word)) for word in text.split()
    )


def text_processing():
    """Replace every plot in the global ``X`` with its stemmed + lemmatized text.

    Each plot is split on whitespace, every word is stemmed and then
    lemmatized, and the words are joined back with single spaces.

    The work is done with ``Series.map`` rather than ``X[i]`` inside a
    ``range(len(X))`` loop: integer ``X[i]`` is a label lookup, so the loop
    only worked while the index labels were exactly 0..n-1, and it paid the
    cost of a pandas element assignment plus two split/join passes per plot.
    ``map`` visits every row regardless of its label and keeps the index.

    Still slow on the full dataset, because every word of every plot goes
    through the stemmer and the lemmatizer.
    """
    global X
    X = X.map(_normalize_plot)


text_processing()

In [None]:
# Print the same example plot again, now that text_processing() has run on X.
print(X[0])

In [None]:
# Vectorizer that turns plot text into a TF-IDF matrix, ignoring English stop words.
# To work with raw term counts instead, swap in:
#     CountVectorizer(stop_words='english', max_features=10000)
tfidf_vec_ = TfidfVectorizer(stop_words='english')

# 80/20 train/test split. train_test_split shuffles the rows, and
# random_state=0 keeps that shuffle reproducible.
split_parts = train_test_split(X, y, train_size=0.8, random_state=0)
train_plots, test_plots, ytrain, ytest = split_parts

# Fit the vectorizer on the training plots only and transform them, then
# transform the test plots with the already-fitted vectorizer.
xtrain = tfidf_vec_.fit_transform(train_plots)
xtest = tfidf_vec_.transform(test_plots)




In [None]:


def reduce_feature(score_threshold=5, show_plots=True):
    """Keep only the feature columns whose chi-squared score exceeds a threshold.

    Scores every column of the global ``xtrain`` against the global ``ytrain``
    with the chi-squared statistic, optionally plots the score distribution,
    and slices both ``xtrain`` and ``xtest`` down to the columns that pass.

    Parameters
    ----------
    score_threshold : float, default 5
        A column is kept when its chi2 score is strictly greater than this.
    show_plots : bool, default True
        When true, draw two histograms of the scores (bins 0-10 and 10-100).

    Returns
    -------
    tuple
        ``(reduced_train, reduced_test)``: ``xtrain`` and ``xtest`` restricted
        to the selected columns, in their original column order.
    """
    # Only the fitted scores_ are read below; the selector's top-k mask is never
    # used. k='all' therefore avoids the error/warning a fixed k=50000 triggers
    # (depending on the scikit-learn version) whenever the vocabulary is smaller
    # than 50000 terms, e.g. with a vectorizer limited by max_features.
    univariate_selection = SelectKBest(chi2, k='all')
    univariate_selection.fit(xtrain, ytrain)
    univariate_score = univariate_selection.scores_

    if show_plots:
        # Two views of the score distribution: unit-wide bins over 0-10 and
        # ten-wide bins over 10-100.
        histogram_bins = (
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
        )
        for bins in histogram_bins:
            fig = plt.figure(figsize=(20, 20))
            plt.hist(univariate_score, bins=bins)
            plt.title('chi2 scores in [%d, %d]' % (bins[0], bins[-1]))
            plt.xlabel('chi2 score')
            plt.ylabel('number of features')

    # Column positions of the features that pass the threshold.
    chosen_features = np.where(univariate_score > score_threshold)[0]
    print(chosen_features)
    print(len(chosen_features))

    reduced_train = xtrain[:, chosen_features]
    reduced_test = xtest[:, chosen_features]
    return reduced_train, reduced_test


reduced_train, reduced_test = reduce_feature()





In [None]:
def _fit_and_score(estimator, train_features, test_features):
    """Fit a one-vs-rest wrapper around ``estimator`` and score it on the test split.

    The model is trained on ``train_features`` with the global ``ytrain`` and
    evaluated on ``test_features`` against the global ``ytest``.

    Returns the F1 scores computed with ``average=None`` (one score per label
    column); the scores are also printed.
    """
    model = OneVsRestClassifier(estimator)
    model.fit(train_features, ytrain)
    predict = model.predict(test_features)
    test_score = f1_score(ytest, predict, average=None)
    print(test_score)
    return test_score


def eval_model_reduced(estimator):
    """Evaluate ``estimator`` on the reduced feature set.

    Uses the globals ``reduced_train`` / ``reduced_test`` and returns the list
    of per-class F1 scores.
    """
    return _fit_and_score(estimator, reduced_train, reduced_test)


def eval_model_normal(estimator):
    """Evaluate ``estimator`` on the full feature set.

    Uses the globals ``xtrain`` / ``xtest`` and returns the list of per-class
    F1 scores.
    """
    return _fit_and_score(estimator, xtrain, xtest)
    
    

In [None]:
# Train Bernoulli naive Bayes (alpha=0.05) on the reduced features and score it on
# the test set; `result` holds the F1 scores returned by eval_model_reduced.
result = eval_model_reduced(BernoulliNB(alpha=0.05))

In [None]:
# Bar chart of the scores in `result`, one bar per column of y, with the
# y-axis fixed to the range [0, 0.7].
fig, axes = plt.subplots(figsize=(20, 12))
axes.set_ylim([0, 0.7])
axes.bar(y.columns, result)