# Load Data

In [1]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import string,re
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,f1_score,precision_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import joblib

In [2]:
def loading_data(file_path):
    print('Loading data')
    df = pd.read_csv(file_path,encoding='latin-1',names=['rating','user_id','date','query','user','text'])
    df = pd.DataFrame(columns=['text','rating'],data=df)
    df['rating']=df['rating'].apply(lambda x: set_polarity(x))
    return df
def set_polarity(rating):
    if rating == 4:
        return 1
    elif rating == 0:
        return -1

# Data Cleaning

### lower case

In [3]:
def lower_case(df):
    df['text'] = df['text'].apply(lambda x: x.lower())
    return df

### remove punctuation

In [4]:
def remove_punctuations(df):
    print('removing punctuation')
    df['text'] = df['text'].str.replace('[^\w\s]','')
    return df

### remove hyperlinks

In [5]:
def remove_hyperlinks_usernames(df):
    print('removing hyperlinks and usernames')
    df['text'] = df['text'].apply(lambda x: remove_h_u(x))
    return df
def remove_h_u(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links = re.findall(link_regex,text)
    for link in links:
        text = text.replace(link[0],'')
    prefix = ['@']
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in prefix:
                words.append(word)
    return ' '.join(words)

### remove emojis

In [6]:
def remove_emojis(df):
    print('removing emojis')
    df['text'] = df['text'].apply(lambda x: remove_e(x))
    return df
def remove_e(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'',text)

### remove stop words

In [7]:
def remove_stopwords(df):
    print('removing stop words')
    df['text'] = df['text'].apply(lambda x: remove_sw(x))
    return df
def remove_sw(text):
    stop_words = stopwords.words('english')
    words = []
    for word in text.split():
        if word not in stop_words:
            words.append(word)
    return ' '.join(words)

### tokenization

In [8]:
def tokenization(df):
    print('tokenizing')
    df['text'] = df['text'].apply(lambda x: perform_t(x))
    return df
def perform_t(text):
    text = re.split('\W',text)
    return text

### lemmatization

In [9]:
def lemmatization(df):
    print('lemmatizing text')
    df['text'] = df['text'].apply(lambda x: perform_l(x))
    return df
def perform_l(text):
    wnl = nltk.WordNetLemmatizer()
    text = [wnl.lemmatize(token) for token in text]
    return text

### stemming

In [10]:
def stemming(df):
    print('stemming text')
    df['text'] = df['text'].apply(lambda x: perform_s(x))
    return df
def perform_s(text):
    sp = nltk.PorterStemmer()
    text =[sp.stem(token) for token in text]
    return text

### data cleaner

In [11]:
def cleaning_data(df):
    lower_case(df)
    remove_hyperlinks_usernames(df)
    remove_punctuations(df)
    remove_emojis(df)
    remove_stopwords(df)
    tokenization(df)
    lemmatization(df)
    stemming(df)
    return df

# Feature Extraction

In [12]:
def feature_extraction(df):
    # count vectorizer
    print('Extracting Features')
#     cv = CountVectorizer()
#     cv.fit(df['text'].astype(str))
#     feature_vector = cv.transform(df['text'].astype(str))
    
    # TF-IDF
    tf_idf = TfidfVectorizer()
    tf_idf.fit(df['text'].astype(str))
    feature_vector = tf_idf.transform(df['text'].astype(str))
    return feature_vector

# Feature Selection

In [13]:
def feature_selection(feature_vector,df):
    print('Selecting Features')
    # defining variables
    X = feature_vector
    y = df['rating']
    
    # chi2
    X = SelectKBest(chi2,k=100).fit_transform(X,y)
    
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
    
    # standardizing data
    scaler = StandardScaler(with_mean=False)
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test

# Classification

In [14]:
def classification(X_train, X_test, y_train, y_test):
    classifiers = {
#         'LR' : make_pipeline(LogisticRegression()),
#         'MNB' : make_pipeline(MultinomialNB()),
#         'SVM' : make_pipeline(LinearSVC(dual=False)),
        'DT' : make_pipeline(DecisionTreeClassifier())
#         'RSS' : make_pipeline(BaggingClassifier())
    }
    hyperparameter_grid = {
        'LR' : {
            'logisticregression__max_iter':[500,1000,1500,2000]
        },
        'MNB' : {
            'multinomialnb__alpha':[0.001,0.005,0.01,0.05,0.10,0.50,0.99,1.0]
        },
        'SVM' : {
            'linearsvc__C': [0.1,1, 10, 100],
            'linearsvc__max_iter':[100,250,500,1000,2000,3000] 
        },
        'DT' : {
#             'decisiontreeclassifier__criterion' : ['gini','entropy'],
#             'decisiontreeclassifier__splitter' : ['best','random']
        },
        'RSS' : {
            'baggingclassifier__max_samples' : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
            'baggingclassifier__n_estimators' : [10, 50, 100, 500, 500, 1000, 5000]
#             'baggingclassifier__base_estimator' : [KNeighborsClassifier(n_neighbors=1),KNeighborsClassifier(n_neighbors=5),KNeighborsClassifier(n_neighbors=15),KNeighborsClassifier(n_neighbors=20)]
        }
    }
    models = {}
    for classifier,pipeline in classifiers.items():
        print('starting training {}'.format(classifier))
        model = GridSearchCV(pipeline,hyperparameter_grid[classifier])
        model.fit(X_train,y_train)
        models[classifier] = model
        print('finished training {}'.format(classifier))
    return models

# Evaluation

In [15]:
def evaluation(models,X_test,y_test,df):
    # Accuracy Score
    for model in models.keys():
        predictions = models[model].predict(X_test)
        print('\n')
        print('{} accuracy score : {} {}'.format(model,accuracy_score(y_test,predictions)*100,'%'))
        
        # Precision
        print('{} Precision score : {} {}'.format(model,precision_score(y_test,predictions)*100,'%'))
        
        # Recall
        print('{} Recall score : {} {}'.format(model,recall_score(y_test,predictions)*100,'%'))
        
        # Confusion Matrix
        cm = confusion_matrix(y_test,predictions,labels = df['rating'].unique())
        df_cm = pd.DataFrame(cm,columns=df['rating'].unique(),index = df['rating'].unique())
        for i in df_cm:
            df_cm[i] = df_cm[i]/df_cm[i].sum()
        print(model,'\n',df_cm)

        # F-measure
        
        # ROC Area 
        # TP rate, FP rate, 
        
        print('{} best Hyperarameter: {}'.format(model,models[model].best_estimator_))

# Main

In [16]:
# Load Data - into a data frame
# file_name = 'twitter/sentiment140.csv'
# df = loading_data(file_name)

# # Clean Data - pre-processing data before training
# df = cleaning_data(df)

df = pd.read_csv('clean_text1.csv')
# df = df[df['rating']==1].head(25000).append(df[df['rating']==-1].head(25000))

# Feature Extraction - returns a feature vector
feature_vector = feature_extraction(df)

# Feature Selection - returns a feature vector with a subset of the most important features(data splitting as well)
X_train, X_test, y_train, y_test = feature_selection(feature_vector,df)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

# Classification - multiple different classification algorithms applied on to train the model(dict for classifiers and HPG)
models = classification(X_train, X_test, y_train, y_test)

# Evaluation - evaluation of the different trained models
evaluation(models,X_test,y_test,df)

Extracting Features
Selecting Features
(1280000, 100) (320000, 100) (1280000,) (320000,)
starting training DT
finished training DT


DT accuracy score : 65.07843749999999 %
DT Precision score : 61.924049632498225 %
DT Recall score : 78.3383011829106 %
DT 
           -1        1
-1  0.705109  0.38076
 1  0.294891  0.61924
DT best Hyperarameter: Pipeline(steps=[('decisiontreeclassifier', DecisionTreeClassifier())])


In [17]:
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 18:41:36
