In [3]:
import pandas as pd
# from googletrans import Translator
from cleantext import clean

import nltk
from nltk.corpus import stopwords
# nltk.download("stopwords")

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, FeatureHasher, TfidfTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [4]:
def convert_to_csv(filename = 'VMR Python Data TEST Sep\'22.xlsx'):
    df = pd.DataFrame(pd.read_excel(filename))
    df.to_csv('movies.csv', index = None, header = ['name', 'class'])

def read_csv(filename = 'movies.csv'):
    return pd.read_csv(filename)

In [7]:
def clean_data(df, translate = False, stem = False, lemm = True):
#     translator = Translator()
    stop = stopwords.words('english')
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    df['name'] = df['name'].str.replace('[^A-Za-z0-9 ]+', ' ')
    if translate: 
        df['name'] = df['name'].apply(lambda x: translator.translate(x, dest = 'en'))
    df['name'] = df['name'].apply(lambda x: clean(x))
    df['name'] = df['name'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    if stem:
        df['name'] = df['name'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
    if lemm:
        df['name'] = df['name'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
    
    df['class'] = df['class'].map({'Entertainment': 0, 'News': 1, 'Sports': 2})
    df = df.dropna()
    return df

def save_df(df, filename = 'movies_cleaned.csv'):
    df.to_csv(filename, index = None)
    
def get_data(filename = 'movies_cleaned.csv'):
    df = shuffle(pd.read_csv(filename).dropna())
    X = df['name']
    y = df['class']
    return X, y

In [8]:
# convert_to_csv()
df = read_csv()
df = clean_data(df)
save_df(df)

In [9]:
def bag_of_words(X):
    vectorizer = CountVectorizer(stop_words = 'english')
    X = vectorizer.fit_transform(X)
    return X

def one_hot_encoding(X):
    one_hot_encoder = OneHotEncoder()
    X = X.values.reshape(-1, 1)
    X = one_hot_encoder.fit_transform(X)
    return X

def word_2_vector(X):
    w2v_model = gensim.models.Word2Vec(X, vector_size = 100, window = 5, min_count = 2)

def glove(X):
    return X

def tfidf(X):
    tfidf_vectorizer = TfidfVectorizer(max_df = 0.8, max_features = 10000)
    X = tfidf_vectorizer.fit_transform(X)
    return X

In [10]:
def logistic_regression(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    lr = LogisticRegression(C = 100.0, random_state = 1, solver = 'lbfgs', multi_class = 'ovr')
    lr.fit(X_train, y_train)
    y_predict = lr.predict(X_test)
    print(y_predict)
    print("Logistic Regression Accuracy %.3f" %metrics.accuracy_score(y_test, y_predict))
    
def sgd_classifier(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    sgd = SGDClassifier(loss = 'hinge', penalty = 'l2', alpha = 1e-3, random_state = 42, max_iter = 20, tol = None)
    sgd.fit(X_train, y_train)
    y_predict = sgd.predict(X_test)
    print("SGD Classifier Accuracy %.3f" %metrics.accuracy_score(y_test, y_predict))
    
def linear_svc(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    lsvc = LinearSVC()
    lsvc.fit(X_train, y_train)
    y_predict = lsvc.predict(X_test)
    print("Linear SVC Accuracy %.3f" %metrics.accuracy_score(y_test, y_predict))
    
def knn(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    knn = KNeighborsClassifier(n_neighbors = 3)
    knn.fit(X_train, y_train)
    y_predict = knn.predict(X_test)
    print("K Neighbors Classifier Accuracy %.3f" %metrics.accuracy_score(y_test, y_predict))
    
def tree(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    y_predict = tree.predict(X_test)
    print("Decision Tree Classifier Accuracy %.3f" %metrics.accuracy_score(y_test, y_predict))
    
def nn(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    nn = MLPClassifier(n_neighbors = 3)
    nn.fit(X_train, y_train)
    y_predict = nn.predict(X_test)
    print("MLP Classifier Accuracy %.3f" %metrics.accuracy_score(y_test, y_predict))
    
def naive_bayes(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    y_predict = nb.predict(X_test)
    print("Naive Bayes Classifier Accuracy %.3f" %metrics.accuracy_score(y_test, y_predict))
    
def random_forest(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    random_forest = RandomForestClassifier()
    random_forest.fit(X_train, y_train)
    y_predict = random_forest.predict(X_test)
    print("Random Forest Classifier Accuracy %.3f" %metrics.accuracy_score(y_test, y_predict))
    
    

In [16]:
X, y = get_data()

# X = bag_of_words(X)
X = tfidf(X)
logistic_regression(X, y)

[0 0 0 ... 0 0 2]
Logistic Regression Accuracy 0.982


In [20]:
X, y = get_data()
X_train, X_test, y_train, y_test = train_test_split(X, y)
movie_clf = Pipeline([
     ('vect', CountVectorizer(stop_words = 'english')),
     ('tfidf', TfidfTransformer()),
     ('clf', LogisticRegression(C = 100.0, random_state = 1, solver = 'lbfgs', multi_class = 'ovr'))
])

movie_clf.fit(X, y)
y_predict = movie_clf.predict(X_test)
print("Classifier Accuracy %.3f" %metrics.accuracy_score(y_test, y_predict))

Classifier Accuracy 0.992
