# Load Data

In [None]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import string,re
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,f1_score,precision_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import joblib, pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [None]:
def loading_data(file_path, n_rows=100):
    """Load one of the supported dataset files into a DataFrame.

    Three layouts are handled, selected by ``file_path``:

    * ``'TSA_datasets/sentiment140.csv'`` - read without a header using
      latin-1 encoding; only ``text`` and ``sentiment`` are kept and the
      ``sentiment`` values are mapped through ``set_polarity``.
    * ``'TSA_datasets/IMDB Dataset.csv'`` - read as-is, with the first
      column used as the index.
    * anything else - the ``reviewText`` and ``overall`` columns are kept
      and renamed to ``text`` and ``sentiment``; ``sentiment`` is mapped
      through ``set_polarity`` and rows with missing values are dropped.

    Args:
        file_path: Path of the CSV file to read.
        n_rows: Maximum number of leading rows to return. ``None`` returns
            every row. Defaults to 100, the previously hard-coded limit.

    Returns:
        An independent DataFrame holding at most ``n_rows`` rows.
    """
    if file_path == 'TSA_datasets/sentiment140.csv':
        column_names = ['sentiment', 'user_id', 'date', 'query', 'user', 'text']
        df = pd.read_csv(file_path, encoding='latin-1', names=column_names)
        # Copy the column subset so the assignment below writes to an
        # independent frame instead of a slice of the raw data
        # (avoids pandas' SettingWithCopyWarning).
        df = df[['text', 'sentiment']].copy()
        df['sentiment'] = df['sentiment'].apply(set_polarity)
    elif file_path == 'TSA_datasets/IMDB Dataset.csv':
        df = pd.read_csv(file_path, index_col=0)
    else:
        df = pd.read_csv(file_path, index_col=0)
        df = pd.DataFrame(columns=['reviewText', 'overall'], data=df)
        df = df.rename(columns={'reviewText': 'text', 'overall': 'sentiment'})
        df['sentiment'] = df['sentiment'].apply(set_polarity)
        # set_polarity yields None for a rating of exactly 3; those rows are
        # removed here together with any other incomplete rows.
        df = df.dropna()
    if n_rows is None:
        return df
    # Return a copy so that later in-place column assignments by callers do
    # not operate on a slice of the full frame.
    return df.head(n_rows).copy()
def set_polarity(sentiment):
    """Map a numeric score to a polarity label.

    Scores below 3 give ``'negative'``, scores above 3 give ``'positive'``
    and anything else (a score of exactly 3, or a value that compares as
    neither smaller nor larger, such as NaN) gives ``None``.
    """
    if sentiment < 3:
        return 'negative'
    if sentiment > 3:
        return 'positive'
    return None

# Data Cleaning

### lower case

In [None]:
def lower_case(df):
    """Lower-case every entry of ``df['text']`` in place and return ``df``.

    The vectorised ``.str.lower()`` accessor is used instead of calling
    ``x.lower()`` per element, so missing values stay missing instead of
    raising ``AttributeError``.
    """
    df['text'] = df['text'].str.lower()
    return df

### remove punctuation

In [None]:
def remove_punctuations(df):
    """Strip punctuation from ``df['text']`` in place and return ``df``.

    Every character that is neither a word character (``\\w``, which
    includes the underscore) nor whitespace is deleted.
    """
    print('removing punctuation')
    # regex=True is required: since pandas 2.0 ``str.replace`` treats the
    # pattern as a literal string by default, which would leave the text
    # untouched. The raw string avoids invalid-escape warnings for \w and \s.
    df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
    return df

### remove hyperlinks and usernames

In [None]:
def remove_hyperlinks_usernames(df):
    """Run ``remove_h_u`` over every entry of ``df['text']``.

    The column is overwritten in place and the same frame is returned.
    """
    print('removing hyperlinks and usernames')
    cleaned = df['text'].apply(remove_h_u)
    df['text'] = cleaned
    return df
# Compiled once at import time. Matches "http:" / "https:" followed by one or
# more "//" or "\" separators and then any run of URL characters.
_LINK_REGEX = re.compile(
    r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)',
    re.DOTALL,
)


def remove_h_u(text):
    """Return ``text`` without hyperlinks and without ``@``-prefixed words.

    Links are removed in a single regex substitution. The earlier
    find-then-``str.replace`` approach left fragments behind when one link
    was a prefix of another (``http://a`` and ``http://a.com``).

    The remaining text is split on whitespace, every word starting with
    ``@`` is dropped, and the survivors are joined with single spaces, so
    runs of whitespace are collapsed as a side effect.
    """
    without_links = _LINK_REGEX.sub('', text)
    kept_words = [
        word for word in without_links.split() if not word.startswith('@')
    ]
    return ' '.join(kept_words)

### remove emojis

In [None]:
def remove_emojis(df):
    """Run ``remove_e`` over every entry of ``df['text']``.

    The column is overwritten in place and the same frame is returned.
    """
    print('removing emojis')
    stripped = df['text'].apply(remove_e)
    df['text'] = stripped
    return df
def remove_e(text):
    """Return ``text`` with every character in the ranges below removed."""
    # NOTE(review): the last range runs from U+24C2 all the way to U+1F251,
    # which covers far more than emoji (CJK ideographs, for example, fall
    # inside it). Confirm this breadth is intended.
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    matcher = re.compile(character_class, flags=re.UNICODE)
    return matcher.sub(r'', text)

### remove stop words

In [None]:
def remove_stopwords(df):
    """Run ``remove_sw`` over every entry of ``df['text']``.

    The column is overwritten in place and the same frame is returned.
    """
    print('removing stop words')
    filtered = df['text'].apply(remove_sw)
    df['text'] = filtered
    return df
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once rather than once per processed text.
_STOP_WORD_SETS = {}


def remove_sw(text):
    """Return ``text`` without English stop words.

    The text is split on whitespace, every word found in NLTK's English
    stop-word list is dropped, and the rest is joined with single spaces.
    The comparison is exact, so casing matters.
    """
    stop_words = _STOP_WORD_SETS.get('english')
    if stop_words is None:
        # A frozenset gives constant-time membership tests; the list
        # returned by NLTK would be scanned linearly for every word.
        stop_words = frozenset(stopwords.words('english'))
        _STOP_WORD_SETS['english'] = stop_words
    return ' '.join(word for word in text.split() if word not in stop_words)

### tokenization

In [None]:
def tokenization(df):
    """Replace every entry of ``df['text']`` with the result of ``perform_t``.

    The column is overwritten in place and the same frame is returned.
    """
    print('tokenizing')
    tokens = df['text'].apply(perform_t)
    df['text'] = tokens
    return df
def perform_t(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the result."""
    return nltk.word_tokenize(text)

### lemmatization

In [None]:
def lemmatization(df):
    """Replace every entry of ``df['text']`` with the result of ``perform_l``.

    The column is overwritten in place and the same frame is returned.
    """
    print('lemmatizing text')
    lemmas = df['text'].apply(perform_l)
    df['text'] = lemmas
    return df
def perform_l(text):
    """Lemmatize each item of ``text`` with a WordNet lemmatizer.

    ``text`` is iterated item by item, so it is expected to already be a
    sequence of tokens. A new list of lemmatized tokens is returned.
    """
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, text))

### stemming

In [None]:
def stemming(df):
    """Replace every entry of ``df['text']`` with the result of ``perform_s``.

    The column is overwritten in place and the same frame is returned.
    """
    print('stemming text')
    stems = df['text'].apply(perform_s)
    df['text'] = stems
    return df
def perform_s(text):
    """Stem each item of ``text`` with a Porter stemmer.

    ``text`` is iterated item by item, so it is expected to already be a
    sequence of tokens. A new list of stemmed tokens is returned.
    """
    stem = nltk.PorterStemmer().stem
    return list(map(stem, text))

### data cleaner

In [None]:
def cleaning_data(df):
    """Run the full cleaning pipeline on ``df`` and return it.

    Each step rewrites ``df['text']`` in place. The order matters:
    lower-casing, link/username removal, punctuation removal, emoji
    removal and stop-word removal work on strings, after which the text is
    tokenized and the tokens are lemmatized and then stemmed.
    """
    pipeline = (
        lower_case,
        remove_hyperlinks_usernames,
        remove_punctuations,
        remove_emojis,
        remove_stopwords,
        tokenization,
        lemmatization,
        stemming,
    )
    for step in pipeline:
        step(df)
    return df

# Feature Extraction

In [None]:
def feature_extraction(df, vectorizer_path="transformers/twitter/vectorizer.pickle"):
    """Turn ``df['text']`` into a TF-IDF feature matrix.

    Every entry of the column is converted with ``astype(str)`` before
    vectorisation, so entries that are not strings are vectorised from
    their string representation. The fitted vectorizer is pickled to
    ``vectorizer_path`` so it can be reused later.

    Args:
        df: Frame with a ``text`` column.
        vectorizer_path: Destination of the pickled vectorizer. Defaults to
            the previously hard-coded location.

    Returns:
        The TF-IDF matrix produced by the fitted vectorizer.
    """
    print('Extracting Features')

    # TF-IDF: fit and transform in one call.
    vectorizer = TfidfVectorizer()
    feature_vector = vectorizer.fit_transform(df['text'].astype(str))

    # Save the vectorizer; the context manager guarantees the file handle is
    # flushed and closed even if pickling fails.
    with open(vectorizer_path, "wb") as handle:
        pickle.dump(vectorizer, handle)

    return feature_vector

# Feature Selection

In [None]:
def feature_selection(feature_vector, df, k=10000, test_size=0.2,
                      random_state=None,
                      selector_path="transformers/twitter/selector.pickle"):
    """Keep the best chi-squared features and split into train/test sets.

    Args:
        feature_vector: Feature matrix with one row per row of ``df``.
        df: Frame whose ``sentiment`` column holds the target labels.
        k: Upper bound on the number of features to keep (default 10000,
            the previously hard-coded value).
        test_size: Fraction of rows placed in the test split.
        random_state: Seed forwarded to ``train_test_split``; ``None`` keeps
            the previous non-deterministic shuffling.
        selector_path: Destination of the pickled selector. Defaults to the
            previously hard-coded location.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    print('Selecting Features')
    X = feature_vector
    y = df['sentiment']

    # chi2: never request more features than exist. Small inputs produce
    # fewer than ``k`` columns, which makes SelectKBest raise on older
    # scikit-learn releases and warn on newer ones.
    k_best = min(k, X.shape[1])
    selector = SelectKBest(chi2, k=k_best)
    selector.fit(X, y)
    X = selector.transform(X)

    # Save the selector; the context manager closes the file handle.
    with open(selector_path, "wb") as handle:
        pickle.dump(selector, handle)

    # Split the data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

# Classification

In [None]:
def classification(X_train, y_train):
    """Grid-search several classifiers on the training data.

    Each classifier is wrapped in a single-step pipeline and tuned with
    ``GridSearchCV`` over its own hyperparameter grid.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        Dict mapping the classifier's short name (``'LR'``, ``'MNB'``,
        ``'SVM'``, ``'MLP'``, ``'ADA-B'``) to its fitted ``GridSearchCV``.
    """
    classifiers = {
        'LR': make_pipeline(LogisticRegression()),
        'MNB': make_pipeline(MultinomialNB()),
        # The comma after this entry was missing, which made the whole
        # definition a SyntaxError.
        'SVM': make_pipeline(LinearSVC(dual=False)),
        # hidden_layer_sizes is written as a one-element tuple; "(5000)" is
        # just the int 5000. The grid below overrides it during the search.
        'MLP': make_pipeline(MLPClassifier(random_state=1,
                                           hidden_layer_sizes=(5000,),
                                           verbose=True,
                                           early_stopping=True)),
        'ADA-B': make_pipeline(AdaBoostClassifier()),
    }
    # Keys follow make_pipeline's naming: "<lower-cased class name>__<param>".
    hyperparameter_grid = {
        'LR': {
            'logisticregression__max_iter': [500, 1000, 1500, 2000],
        },
        'MNB': {
            'multinomialnb__alpha': [0.001, 0.005, 0.01, 0.05, 0.10, 0.50, 0.99, 1.0],
        },
        'SVM': {
            'linearsvc__C': [0.1, 1, 10, 100],
            'linearsvc__max_iter': [100, 250, 500, 1000, 2000, 3000],
        },
        'MLP': {
            'mlpclassifier__hidden_layer_sizes': [(100,), (200,), (300,), (400,), (500,)],
        },
        'ADA-B': {
            'adaboostclassifier__learning_rate': [0.1, 0.5, 1.0, 1.5],
        },
    }
    models = {}
    for classifier, pipeline in classifiers.items():
        print('starting training {}'.format(classifier))
        model = GridSearchCV(pipeline, hyperparameter_grid[classifier])
        model.fit(X_train, y_train)
        models[classifier] = model
        print('finished training {}'.format(classifier))
    return models

# Evaluation

In [None]:
def evaluation(models, X_test, y_test, df):
    """Print evaluation metrics for every fitted model.

    For each model this prints the raw predictions, then accuracy,
    precision, recall and F1 (the last three with ``'positive'`` as the
    positive label), a confusion matrix in which each column is divided by
    its own sum, and the hyperparameters selected by the grid search.

    Args:
        models: Dict mapping a model name to a fitted search object exposing
            ``predict`` and ``best_params_``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.
        df: Frame whose ``sentiment`` column supplies the label set used for
            the confusion matrix.
    """
    # Computed once; the label order fixes the confusion-matrix layout.
    labels = df['sentiment'].unique()

    for name, model in models.items():
        predictions = model.predict(X_test)
        print(predictions)
        print('\n')

        # Accuracy Score
        print('{} accuracy score : {} {}'.format(
            name, accuracy_score(y_test, predictions) * 100, '%'))

        # Precision
        print('{} Precision score : {} {}'.format(
            name, precision_score(y_test, predictions, pos_label='positive') * 100, '%'))

        # Recall
        print('{} Recall score : {} {}'.format(
            name, recall_score(y_test, predictions, pos_label='positive') * 100, '%'))

        # F-1 Score
        print('{} F-1 score : {} {}'.format(
            name, f1_score(y_test, predictions, pos_label='positive') * 100, '%'))

        # Confusion Matrix, normalised per column.
        cm = confusion_matrix(y_test, predictions, labels=labels)
        df_cm = pd.DataFrame(cm, columns=labels, index=labels)
        for column in df_cm:
            df_cm[column] = df_cm[column] / df_cm[column].sum()
        print(name, 'Confusion Matrix:', '\n', df_cm)

        # Report the selected hyperparameters rather than the whole
        # estimator (also fixes the "Hyperarameter" typo in the label).
        print('{} best Hyperparameter: {}'.format(name, model.best_params_))

In [None]:
def save_models(models):
    """Dump every model in ``models`` to disk with joblib.

    Each model is written to
    ``models/final models/binary class/<name>.pk1`` after its name is
    printed.
    """
    for name, fitted_model in models.items():
        print('{}'.format(name))
        target_path = 'models/final models/binary class/{}.pk1'.format(name)
        joblib.dump(fitted_model, target_path)

In [None]:
def current_time():
    """Print the current local time as ``HH:MM:SS``."""
    from datetime import datetime
    timestamp = datetime.now().strftime("%H:%M:%S")
    print("Current Time =", timestamp)

# Main

In [None]:
# Load Data - into a data frame
data_sets = {
    'tweets' : 'TSA_datasets/sentiment140.csv',
    'movie_reviews' : 'TSA_datasets/IMDB Dataset.csv',
    'electronics' : 'TSA_datasets/Electronics.csv',
    'amazon_instant_videos' : 'TSA_datasets/Amazon_Instant_Video.csv',
    'beauty' : 'TSA_datasets/beauty.csv',
    'cell_phones_and_accessories' : 'TSA_datasets/cell_phones_and_accessories.csv',
    'clothing_shoes_and_jewelry' : 'TSA_datasets/clothing_shoes_and_jewelry.csv',
    'health_and_personal_care' : 'TSA_datasets/health_and_personal_care.csv',
    'sports_and_outdoors' : 'TSA_datasets/sports_and_outdoors.csv'  
}

# The frame was never loaded before being cleaned, so the next step failed
# with a NameError on a fresh run.
# NOTE(review): 'tweets' is chosen because the vectorizer and selector are
# saved under "transformers/twitter/" - confirm this is the intended dataset.
df = loading_data(data_sets['tweets'])

# Clean Data - pre-processing data before training
df = cleaning_data(df)

# Feature Extraction - returns a feature vector
feature_vector = feature_extraction(df)

# Loading data from cleaned file
# df = pd.read_csv('TSA_datasets/clean_twitter_data.csv')

# Feature Selection - returns a feature vector with a subset of the most important features (splits the data as well)
X_train, X_test, y_train, y_test = feature_selection(feature_vector,df)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

# Classification - several classification algorithms are trained (dicts hold the classifiers and their hyperparameter grids)
models = classification(X_train, y_train)

# Evaluation - evaluation of the different trained models
evaluation(models,X_test,y_test,df)

# Save Models
save_models(models)

# convert to csv

In [None]:
def json_to_csv():
    """Convert the line-delimited JSON datasets to CSV files.

    Each source is read with ``pd.read_json(..., lines=True)`` and written
    to ``TSA_datasets/<name>.csv``; the dataset name is printed first.
    """
    sources = {
        'beauty' : 'TSA_datasets/json/Beauty.json',
        'cell_phones_and_accessories' : 'TSA_datasets/json/Cell_Phones_and_Accessories.json',
        'clothing_shoes_and_jewelry' : 'TSA_datasets/json/Clothing_Shoes_and_Jewelry.json',
        'health_and_personal_care' : 'TSA_datasets/json/Health_and_Personal_Care.json',
        'sports_and_outdoors' : 'TSA_datasets/json/Sports_and_Outdoors.json',
        'toys_and_games' : 'TSA_datasets/json/Toys_and_Games.json'
    }
    for name, json_path in sources.items():
        print('{}'.format(name))
        frame = pd.read_json(json_path, lines=True)
        frame.to_csv('TSA_datasets/{}.csv'.format(name))