In [9]:
import os
import pickle
import pandas as pd
# from googletrans import Translator
from cleantext import clean

import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sharhad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
# input is csv

class Data:
    """Prepare the movie-title dataset: optional Excel -> CSV conversion, cleaning, saving.

    Instantiating the class runs the whole pipeline as a side effect:

    1. if ``convert`` is true, ``excel_filename`` is rewritten as ``csv_filename``;
    2. ``csv_filename`` is read;
    3. the ``name`` column is normalised (see :meth:`clean_data`);
    4. the cleaned frame is written back to ``csv_filename``, overwriting it.

    Parameters
    ----------
    excel_filename : str
        Workbook read when ``convert`` is true.
    csv_filename : str
        CSV that is read, cleaned and then overwritten.
    train : bool
        True when the data carries a ``class`` label column.
    convert : bool
        Regenerate ``csv_filename`` from ``excel_filename`` before cleaning.
    translate, stem, lemm : bool
        Text-normalisation switches, forwarded unchanged to :meth:`clean_data`.
    """

    def __init__(self,
                 excel_filename = 'VMR Python Data TEST Sep\'22.xlsx',
                 csv_filename = 'movies.csv',
                 train = True,
                 convert = False,
                 translate = False,
                 stem = False,
                 lemm = True):
        if convert:
            self.convert_to_csv(csv_filename = csv_filename,
                                excel_filename = excel_filename,
                                train = train)
        df = self.read_csv(filename = csv_filename)
        # Forward the caller's switches; they used to be replaced by literals here,
        # so translate/stem/lemm passed to the constructor had no effect.
        df = self.clean_data(df, translate = translate, stem = stem, lemm = lemm, train = train)
        self.save_df(df, filename = csv_filename)

    def convert_to_csv(self, csv_filename = 'movies.csv',
                       excel_filename = 'VMR Python Data TEST Sep\'22.xlsx',
                       train = True):
        """Write the workbook to ``csv_filename`` with normalised column names.

        The columns are renamed to ``name`` (plus ``class`` when ``train`` is
        true), so the workbook must contain exactly that many columns; pandas
        raises ``ValueError`` otherwise.
        """
        header = ['name', 'class'] if train else ['name']
        df = pd.read_excel(excel_filename)
        df.to_csv(csv_filename, index = False, header = header)

    def read_csv(self, filename = 'movies.csv'):
        """Return the contents of ``filename`` as a DataFrame."""
        return pd.read_csv(filename)

    def clean_data(self, df, translate = False, stem = False, lemm = True, train = True):
        """Return a cleaned copy of ``df`` (the input frame is not modified).

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain a ``name`` column, and a ``class`` column when ``train``.
        translate : bool
            Translate names to English first. Requires ``Translator`` to be
            importable in this notebook (its import is currently commented out).
        stem : bool
            Apply Porter stemming to every token.
        lemm : bool
            Apply WordNet lemmatisation to every token.
        train : bool
            Encode the ``class`` labels as integers
            (Entertainment=0, News=1, Sports=2).

        Returns
        -------
        pandas.DataFrame
            Rows with a missing value in any column (including labels that are
            not one of the three known classes) are dropped.

        Raises
        ------
        RuntimeError
            If ``translate`` is true but ``Translator`` is not available.

        Notes
        -----
        Label encoding is not idempotent: running this on an already-encoded
        frame with ``train=True`` maps the codes to NaN and drops every row.
        """
        # Rows without a title cannot be cleaned or classified.
        df = df.dropna(subset = ['name']).copy()
        names = df['name'].astype(str)

        if translate:
            try:
                translator = Translator()
            except NameError as exc:
                raise RuntimeError(
                    "translate=True needs `from googletrans import Translator` "
                    "to be enabled in the imports cell"
                ) from exc
            # Translate before stripping characters, otherwise non-Latin titles
            # are already blanked out; keep the translated text, not the result object.
            names = names.apply(lambda text: translator.translate(text, dest = 'en').text)

        # regex=True is required: pandas >= 2.0 treats the pattern literally by default.
        names = names.str.replace(r'[^A-Za-z0-9 ]+', ' ', regex = True)
        names = names.apply(clean)

        stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests
        names = names.apply(
            lambda text: ' '.join(word for word in text.split() if word not in stop_words))

        if stem:
            stemmer = PorterStemmer()
            names = names.apply(
                lambda text: ' '.join(stemmer.stem(word) for word in text.split()))
        if lemm:
            # NOTE(review): WordNetLemmatizer needs the NLTK 'wordnet' corpus; only
            # 'stopwords' is downloaded by this notebook - confirm it is installed.
            lemmatizer = WordNetLemmatizer()
            names = names.apply(
                lambda text: ' '.join(lemmatizer.lemmatize(word) for word in text.split()))

        df['name'] = names

        if train:
            # Unknown labels become NaN and are removed by the dropna() below.
            df['class'] = df['class'].map({'Entertainment': 0, 'News': 1, 'Sports': 2})
        df = df.dropna()
        if train:
            # map() yields floats whenever a label was unknown; keep the codes integral.
            df = df.astype({'class': int})
        return df

    def save_df(self, df, filename = 'movies.csv'):
        """Write ``df`` to ``filename`` without the index and report the path."""
        df.to_csv(filename, index = False)
        print('Data has been cleaned and saved at {}'.format(filename))

In [40]:
class Train:
    """Fit the title classifier and pickle it; all work happens in the constructor.

    Parameters
    ----------
    csv_filename : str
        Intermediate CSV produced from the workbook and used for training.
    excel_filename : str
        Labelled workbook the training data is generated from.
    delete_csv, delete_excel : bool
        Remove the corresponding file once training has finished.
    model_filename : str
        Destination of the pickled scikit-learn pipeline.
    """

    def __init__(self,
                 csv_filename = 'movies.csv',
                 excel_filename = 'VMR Python Data TEST Sep\'22.xlsx',
                 delete_csv = False,
                 delete_excel = False,
                 model_filename = 'model.pkl'):
        # Pass the caller's paths through; they used to be replaced by literals.
        self.train(csv_filename = csv_filename,
                   model_filename = model_filename,
                   excel_filename = excel_filename)
        if delete_csv:
            os.remove(csv_filename)
        if delete_excel:
            os.remove(excel_filename)

    def get_data(self,
                 excel_filename = 'VMR Python Data TEST Sep\'22.xlsx',
                 csv_filename = 'movies.csv'):
        """Rebuild and clean the CSV from the workbook, then return ``(X, y)``.

        ``X`` is the cleaned ``name`` column and ``y`` the ``class`` column,
        both shuffled with a fixed seed so repeated runs see the same order.
        """
        Data(excel_filename = excel_filename,
             csv_filename = csv_filename,
             convert = True)
        # Names that became empty during cleaning are read back as NaN; drop them.
        df = shuffle(pd.read_csv(csv_filename).dropna(), random_state = 1)
        return df['name'], df['class']

    def train(self, csv_filename = 'movies.csv', model_filename = 'model.pkl',
              excel_filename = 'VMR Python Data TEST Sep\'22.xlsx'):
        """Fit a bag-of-words -> TF-IDF -> logistic-regression pipeline and pickle it.

        Parameters
        ----------
        csv_filename : str
            Intermediate CSV (re)generated from ``excel_filename``.
        model_filename : str
            File the fitted pipeline is pickled to.
        excel_filename : str
            Labelled source workbook.
        """
        X, y = self.get_data(excel_filename = excel_filename, csv_filename = csv_filename)
        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases - confirm against the installed version before upgrading.
        movie_clf = Pipeline([
             ('vect', CountVectorizer(stop_words = 'english')),
             ('tfidf', TfidfTransformer()),
             ('clf', LogisticRegression(C = 100.0, random_state = 1, solver = 'lbfgs', multi_class = 'ovr'))
        ])
        model = movie_clf.fit(X, y)
        # Context manager guarantees the file is flushed and closed.
        with open(model_filename, 'wb') as model_file:
            pickle.dump(model, model_file)
        print('Model saved at {}'.format(model_filename))

In [45]:
class Predict:
    """Classify the titles of an unlabelled workbook with a previously pickled model.

    All work happens in the constructor.

    Parameters
    ----------
    csv_filename : str
        Intermediate CSV produced from the workbook. Note that the default is
        the same file ``Train`` uses by default, so it gets overwritten.
    out_filename : str
        CSV the predictions are written to (columns ``name`` and ``class``).
    excel_filename : str
        Unlabelled workbook to classify.
    model_filename : str
        Pickled pipeline to load.
    delete_csv, delete_excel : bool
        Remove the corresponding file once prediction has finished.
    """

    def __init__(self,
                 csv_filename = 'movies.csv',
                 out_filename = 'movies_predict.csv',
                 excel_filename = 'VMR Python Data TEST Sep\'22_predict.xlsx',
                 model_filename = 'model.pkl',
                 delete_csv = False,
                 delete_excel = False):
        # Pass the caller's paths through; they used to be replaced by literals.
        self.predict(csv_filename = csv_filename,
                     excel_filename = excel_filename,
                     model_filename = model_filename,
                     out_filename = out_filename)
        if delete_csv:
            os.remove(csv_filename)
        if delete_excel:
            os.remove(excel_filename)

    def get_data(self,
                 excel_filename = 'VMR Python Data TEST Sep\'22_predict.xlsx',
                 csv_filename = 'movies.csv'):
        """Rebuild and clean the CSV from the workbook; return the ``name`` column."""
        Data(excel_filename = excel_filename,
             csv_filename = csv_filename,
             convert = True, train = False)
        # Names that became empty during cleaning are read back as NaN; drop them.
        df = pd.read_csv(csv_filename).dropna()
        return df['name']

    def predict(self,
                csv_filename = 'movies.csv',
                excel_filename = 'VMR Python Data TEST Sep\'22_predict.xlsx',
                model_filename = 'model.pkl',
                out_filename = 'movies_predict.csv'):
        """Load the model, classify every cleaned title and save the result.

        The output CSV holds the cleaned ``name`` and the predicted ``class``
        code as produced by the model.
        """
        X = self.get_data(csv_filename = csv_filename,
                          excel_filename = excel_filename)
        # SECURITY: unpickling executes arbitrary code - only load model files
        # that were produced by a trusted run of Train.
        with open(model_filename, 'rb') as model_file:
            model = pickle.load(model_file)
        y_predict = model.predict(X)
        pd.DataFrame({'name': X, 'class': y_predict}).to_csv(out_filename, index = False)
        print('Predictions saved at {}'.format(out_filename))

In [46]:
# Build the training CSV from the default workbook, fit the classifier and pickle it.
# NOTE(review): the output recorded under this cell ("Predictions saved at ...",
# <__main__.Predict ...>) comes from a Predict() run, so it is stale - re-run the
# notebook with Restart Kernel -> Run All before sharing.
Train()

Data has been cleaned and saved at movies.csv
Predictions saved at movies_predict.csv


<__main__.Predict at 0x7fd07a4c64c0>

In [None]:
# Classify the default prediction workbook with the pickled model; requires that
# the Train() cell has already produced model.pkl.
Predict()