# Importing the required libraries





In [0]:
!python -m spacy download en_core_web_lg
# After downloading the model, restart the runtime before running the next cell;
# otherwise spacy.load('en_core_web_lg') cannot load the freshly installed model.

Collecting en_core_web_lg==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz (826.9MB)
[K     |████████████████████████████████| 826.9MB 1.1MB/s 
[?25hBuilding wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.1.0-cp36-none-any.whl size=828255076 sha256=c783b353d78ab17683033cc374fd78ee907fdf160c8c9424a591ac0ca66f11d4
  Stored in directory: /tmp/pip-ephem-wheel-cache-s60mdk8y/wheels/b4/d7/70/426d313a459f82ed5e06cc36a50e2bb2f0ec5cb31d8e0bdf09
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
import spacy
# Load the large English spaCy model; `nlp` is the pipeline that nlpfy() uses to get document vectors.
nlp = spacy.load('en_core_web_lg')

In [0]:
import re  
from sklearn.datasets import load_files  
import nltk
from nltk.corpus import stopwords
import io
import os
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split  
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV
import pickle
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')
nltk.download('wordnet')

# Uploading and cleaning the training data

In [0]:
# Load the annotated tweets (the training data)
df = pd.read_csv('annotated_tweets.csv')

In [0]:
# Peek at one example of the text field (row label 300)
df.loc[300, "Text"]

'Another example of the corruption our current representation is a part of and the false claims that @SenatorCarper is for the environment. Bloomberg energy is a failing "green" company that isn\'t green. Yet is subsidized by millions of your tax dollars. https://t.co/vL1As21gWg'

In [0]:
# Split the dataset into the tweet text (X) and the labels (y, the first four columns)
X = df["Text"]
y = df.iloc[:, :4]

In [0]:
# Here we perform all the pre-processing steps and save the output at different stages:
#   documents1 - cleaned, lower-cased text
#   documents2 - documents1 with stop words removed
#   documents3 - documents1 with every token stemmed
#   documents4 - stop words removed, remaining tokens stemmed

documents1 = []
documents2 = []
documents3 = []
documents4 = []


stemmer = SnowballStemmer("english")
stop_words = spacy.lang.en.stop_words.STOP_WORDS

for raw_text in X:
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(raw_text))

    # Remove all single characters surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single character at the start of the text.
    # The caret must stay unescaped: r'\^' matches a literal '^', which cannot
    # survive the \W substitution above, so an escaped pattern never fires.
    document = re.sub(r'^[a-zA-Z]\s+', '', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()
    documents1.append(document)

    # Stop-word removal and stemming (Snowball stemmer, not lemmatization)
    tokens = document.split()
    content_tokens = [word for word in tokens if word not in stop_words]

    documents2.append(' '.join(content_tokens))
    documents3.append(' '.join(stemmer.stem(word) for word in tokens))
    documents4.append(' '.join(stemmer.stem(word) for word in content_tokens))

In [0]:
documents1 # cleaned, lower-cased text; no stemming, stop-words kept

In [0]:
documents2 # no stemming, stop-words removed

In [0]:
documents3 # stemmed (Snowball stemmer, not lemmatized), stop-words kept

In [0]:
documents4[300] # stemmed (Snowball stemmer, not lemmatized), stop-words removed

'exampl corrupt current represent fals claim senatorcarp environ bloomberg energi fail green compani isn green subsid million tax dollar https co vl1as21gwg'

In [0]:
# combining the 4 versions (order: documents1 .. documents4; the training loops iterate in this order)
all_vers = [documents1, documents2, documents3, documents4] 

In [0]:
# transforming the labels into a numpy array
# and fixing the one label that had '2' instead of '1'
# (np.where replaces every 2 in the array with 1 and leaves all other values as they are)
y = y.to_numpy()
y = np.where(y==2, 1, y) 
y.shape

(1186, 4)

# Zero-rule classifier for baseline assessment

In [0]:
# Simple metrics, shows how unbalanced the data is
# zero_y is the zero-rule "prediction": a 0 for every sample in y
zero_y = [0] * len(y)

In [0]:
# Score the all-zero baseline against each of the four label columns
for label in range(4):
    true_labels = y[:, label]
    print("LABEL", label)
    print(metrics.classification_report(true_labels, zero_y))
    print(metrics.roc_auc_score(true_labels, zero_y))

# Word embedding + MLP model

In [0]:
# A function for word embedding (average per tweet)
def nlpfy(x):
    """Return a list holding the spaCy document vector (``nlp(doc).vector``) of every text in ``x``."""
    return [nlp(doc).vector for doc in x]

In [0]:
# A function that:
# 1) splits the data into training and testing
# 2) grid-searches the hyper-parameters of an MLP classifier
# 3) trains the model
# 4) evaluates the predictions for every label column
def fit_model(x, y):
    """Grid-search one multi-label MLP and score it per label on a held-out split.

    Parameters
    ----------
    x : sequence of feature vectors, one per document.
    y : 2-D array-like of labels, one column per label.

    Returns
    -------
    predictions2 : predictions of the fitted grid search on the 20% test split.
    accu_score : list with one ``(classification_report, roc_auc)`` tuple per
        label column, in column order.
    clf2 : the fitted ``GridSearchCV`` object.

    Raises
    ------
    ValueError
        If ``y`` is not 2-dimensional.

    Notes
    -----
    ``train_test_split`` is called without a ``random_state``, so the split
    (and therefore the scores) differ from run to run.
    """
    y = np.asarray(y)
    if y.ndim != 2:
        raise ValueError("y must be a 2-D array with one column per label")

    # Split the data
    dX_train2, dX_test2, dy_train2, dy_test2 = train_test_split(x, y, test_size=0.20)

    # Create a space of possible parameters to choose from.
    # Layer sizes are written as explicit tuples: `(475)` is just the int 475.
    parameter_space = {
        'hidden_layer_sizes': [(475,), (475, 237), (100,), (300,), (700,)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam', 'lbfgs'],
        'alpha': [0.0001, 0.001, 0.05, 0.1],
        'learning_rate': ['constant', 'adaptive'],
    }

    # Fit the data and make predictions
    mlp = MLPClassifier(random_state=0, max_iter=400)
    clf2 = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3, verbose=10)
    clf2.fit(dX_train2, dy_train2)
    predictions2 = clf2.predict(dX_test2)

    # One (report, AUC) pair per label column; the label count comes from the data.
    # NOTE(review): the AUC is computed from the hard predictions, not from
    # predicted probabilities - confirm that this is intended.
    accu_score = []
    for label in range(dy_test2.shape[1]):
        temp_score = metrics.classification_report(dy_test2[:, label], predictions2[:, label])
        auc_score = metrics.roc_auc_score(dy_test2[:, label], predictions2[:, label])
        accu_score.append((temp_score, auc_score))
    return predictions2, accu_score, clf2

In [0]:
# Train one multi-label model per pre-processing variant and collect the outcomes
we_results, best_cl = [], []
for vers in all_vers:
    predictions, accu_scores, classer = fit_model(nlpfy(vers), y)
    we_results.append((predictions, accu_scores))
    best_cl.append(classer)

In [0]:
# Saving the output just in case
# (context managers make sure each file is flushed and closed after the dump)
with open("classifiers.p", "wb") as fh:
    pickle.dump(best_cl, fh)
with open("we_results.p", "wb") as fh:
    pickle.dump(we_results, fh)

In [0]:
# Evaluating the performance: print the ROC AUC per document version and label.
# The loop variables must not be called `y`: that name holds the label array
# that the training cells further down still need.
for doc_version, (_preds, label_scores) in enumerate(we_results):
    print("Docs", doc_version)
    for label, (_report, auc) in enumerate(label_scores):
        print("label", label)
        print(auc)

In [0]:
# The same function as the one above, just for MLPs for individual labels.
def fit_model_s(x, y):
    """Grid-search a separate single-label MLP for every label column.

    Parameters
    ----------
    x : sequence of feature vectors, one per document.
    y : 2-D array-like of labels, one column per label.

    Returns
    -------
    preds : array of shape (n_test_samples, n_labels) with the test-split
        predictions of each per-label model.
    accu_score : list with one ``(classification_report, roc_auc)`` tuple per
        label column, in column order.
    classers : list with one fitted ``GridSearchCV`` object per label column.

    Raises
    ------
    ValueError
        If ``y`` is not 2-dimensional.

    Notes
    -----
    ``train_test_split`` is called without a ``random_state``, so the split
    (and therefore the scores) differ from run to run.
    """
    y = np.asarray(y)
    if y.ndim != 2:
        raise ValueError("y must be a 2-D array with one column per label")

    # Split the data
    dX_train2, dX_test2, dy_train2, dy_test2 = train_test_split(x, y, test_size=0.20)

    # Create a space of possible parameters to choose from.
    # Layer sizes are written as explicit tuples: `(475)` is just the int 475.
    parameter_space = {
        'hidden_layer_sizes': [(475,), (475, 237), (100,), (300,), (700,)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam', 'lbfgs'],
        'alpha': [0.0001, 0.001, 0.05, 0.1],
        'learning_rate': ['constant', 'adaptive'],
    }

    preds = []
    accu_score = []
    classers = []

    for label in range(dy_train2.shape[1]):
        # Build a fresh search object for every label. Re-fitting one shared
        # GridSearchCV would overwrite its state, and every entry of `classers`
        # would end up referring to the model of the last label.
        mlp = MLPClassifier(random_state=0, max_iter=400)
        clf2 = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3, verbose=10)

        # Fit the data and make predictions
        clf2.fit(dX_train2, dy_train2[:, label])
        predictions2 = clf2.predict(dX_test2)
        preds.append(predictions2)
        classers.append(clf2)

        temp_score = metrics.classification_report(dy_test2[:, label], predictions2)
        auc_score = metrics.roc_auc_score(dy_test2[:, label], predictions2)
        accu_score.append((temp_score, auc_score))

    # Stack the per-label prediction vectors and put samples on the rows
    preds = np.array(preds).transpose()
    return preds, accu_score, classers

In [0]:
# Train per-label models for every pre-processing variant and collect the outcomes
we_results_s, best_cl_s = [], []
for vers in all_vers:
    predictions, accu_scores, classer = fit_model_s(nlpfy(vers), y)
    we_results_s.append((predictions, accu_scores))
    best_cl_s.append(classer)

In [0]:
# Save the per-label results and classifiers.
with open("we_results_s.p", "wb") as fh:
    pickle.dump(we_results_s, fh)
# The list filled by the training loop is `best_cl_s`; it is stored under the
# file name "best_cls_s.p".
with open("best_cls_s.p", "wb") as fh:
    pickle.dump(best_cl_s, fh)

# Predictions on complete data

In [0]:
# Load the whole dataset (the tweets to be annotated automatically)
df3 = pd.read_csv('tweets_all_text.csv')

In [0]:
# A check to see what the loaded frame looks like
df3

In [0]:
# Text column of the full dataset; this is the input of the pre-processing below
X3 = df3["Text"]

In [0]:
# Reload the stored classifiers.
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook wrote itself.
with open("best_cls_s.p", "rb") as fh:
    best_cls_s = pickle.load(fh)
with open("classifiers.p", "rb") as fh:
    best_cls = pickle.load(fh)

In [0]:
# Same pre-processing steps as for the training data, applied to the whole dataset:
#   documents_all_1 - cleaned, lower-cased text
#   documents_all_2 - documents_all_1 with stop words removed
#   documents_all_3 - documents_all_1 with every token stemmed
#   documents_all_4 - stop words removed, remaining tokens stemmed

documents_all_1 = []
documents_all_2 = []
documents_all_3 = []
documents_all_4 = []

stemmer = SnowballStemmer("english")
stop_words = spacy.lang.en.stop_words.STOP_WORDS

for raw_text in X3:
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(raw_text))

    # Remove all single characters surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single character at the start of the text.
    # The caret must stay unescaped: r'\^' matches a literal '^', which cannot
    # survive the \W substitution above, so an escaped pattern never fires.
    document = re.sub(r'^[a-zA-Z]\s+', '', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()
    documents_all_1.append(document)

    # Stop-word removal and stemming (Snowball stemmer, not lemmatization)
    tokens = document.split()
    content_tokens = [word for word in tokens if word not in stop_words]

    documents_all_2.append(' '.join(content_tokens))
    documents_all_3.append(' '.join(stemmer.stem(word) for word in tokens))
    documents_all_4.append(' '.join(stemmer.stem(word) for word in content_tokens))

In [0]:
# Same ordering as all_vers, so documents_all[k] is pre-processed like the data of the k-th trained classifier
documents_all = [documents_all_1, documents_all_2, documents_all_3, documents_all_4]

In [0]:
# Make the predictions
# Index 1 selects the "stop-words removed, no stemming" variant for both the
# classifier and the documents. `best_cls` is the list reloaded from
# classifiers.p, so this cell also works when the training cells were not
# re-run in the current session.
predictions0 = best_cls[1].predict(nlpfy(documents_all[1]))

In [0]:
# Save the predictions (the context manager closes the file after the dump)
with open("predictions0.p", "wb") as fh:
    pickle.dump(predictions0, fh)

In [0]:
# Put the text and the four predicted label columns into a pandas DataFrame for the CSV export
datasetw = pd.DataFrame(
    {
        'text': df3["Text"],
        'neg_t': predictions0[:, 0],
        'pol_att': predictions0[:, 1],
        'per_att': predictions0[:, 2],
        'inciv': predictions0[:, 3],
    }
)

In [0]:
# Save the csv
# DataFrame.to_csv writes straight to the path and closes the file itself,
# rather than leaving an unclosed handle behind as a bare open().write() does.
datasetw.to_csv('auto_annotated.csv')

3449333