In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import sklearn
import string
import re # helps you filter urls
from IPython.display import display, Latex, Markdown


In [2]:


# Colab-only setup: mount Google Drive and switch the working directory to the
# notebooks folder on the mounted drive.
# NOTE(review): the CSV files further down are read from absolute /content/...
# paths, so they do not depend on this working directory — confirm the mount
# is still needed.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Colab Notebooks

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks


# Classifying tweets

Analyzing Twitter data related to the 2016 US Presidential election extracted using [the Twitter API](https://dev.twitter.com/overview/api). The data contains tweets posted by the following six Twitter accounts: `realDonaldTrump, mike_pence, GOP, HillaryClinton, timkaine, TheDemocrats`

For every tweet, there are two pieces of information:
- `screen_name`: the Twitter handle of the user tweeting and
- `text`: the content of the tweet.


The overarching goal is to "predict" the political inclination (Republican/Democratic) of a Twitter user from one of their tweets. The ground truth (i.e., true class labels) is determined from the `screen_name` of the tweet as follows:
- `realDonaldTrump, mike_pence, GOP` are Republicans
- `HillaryClinton, timkaine, TheDemocrats` are Democrats



In [14]:
# Fetch the NLTK resources used by this notebook, in a fixed order: the
# stop-word corpus, the WordNet data for lemmatization and the POS tagger model.
for resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Shared lemmatizer instance and the raw English stop-word list.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



## Text Processing

The `process()` function processes and tokenizes raw text. The generated list of tokens meets the following specifications:
1. The tokens are in lower case.
2. The tokens appear in the same order as in the raw text.
3. The tokens are in their lemmatized form.
4. The tokens do not contain any punctuation.
5. The tokens do not contain any part of a url.


In [15]:
# Tokenizer models; process() calls nltk.word_tokenize.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
# Converting part of speech tag from nltk.pos_tag to word net compatible format
# Simple mapping based on first letter of return tag to make grading consistent
# Everything else is considered noun 'n'
posMapping = {
# "First_Letter by nltk.pos_tag":"POS_for_lemmatizer"
    "N":'n',
    "V":'v',
    "J":'a',
    "R":'r'
}

# Patterns are compiled once here instead of on every call to process().
# URLs are matched case-insensitively: the URL filter runs before the text is
# lower-cased, so a case-sensitive pattern would let "HTTP://..." or "WWW...."
# leak into the tokens.
_URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
# Possessive "'s" at a word boundary (applied after lower-casing).
_POSSESSIVE_RE = re.compile(r"'s\b")
# Any single ASCII punctuation character from string.punctuation.
_PUNCTUATION_RE = re.compile('[' + re.escape(string.punctuation) + ']')


def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """ Normalizes case, strips URLs and punctuation, tokenizes and lemmatizes text
    Inputs:
        text: str: raw text
        lemmatizer: an instance of a class implementing the lemmatize() method
                    (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
    Outputs:
        list(str): lower-case, lemmatized tokens in the order they appear in text

    Processing steps, in order:
        1. remove URLs (http://, https:// or www. prefixes, any letter case)
        2. lower-case the text
        3. drop possessive "'s", then delete any remaining apostrophes
           (so "I'm" becomes "im" rather than two tokens)
        4. replace every other ASCII punctuation character with a space
        5. tokenize with nltk.word_tokenize and tag with nltk.pos_tag
        6. lemmatize each token using the POS looked up in posMapping
           (noun when the tag's first letter is not in the mapping)

    Characters outside string.punctuation (e.g. the unicode ellipsis) are
    not removed.
    """
    text = _URL_RE.sub('', text)
    text = text.lower()

    text = _POSSESSIVE_RE.sub('', text)
    text = text.replace("'", '')
    text = _PUNCTUATION_RE.sub(' ', text)

    tokens = nltk.word_tokenize(text)

    lemmas = []
    for word, tag in nltk.pos_tag(tokens):
        wordnet_pos = posMapping.get(tag[0], 'n')
        try:
            lemmas.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
        except Exception:
            # Deliberate best-effort: a token the lemmatizer cannot handle is
            # dropped instead of aborting the whole text.
            continue

    return lemmas

In [17]:
# NOTE(review): nltk is already imported and used by earlier cells, so this
# install/import looks redundant. If an install is really needed (e.g. on
# Colab), consider a pinned `%pip install` in a setup cell at the top.
!pip install nltk
import nltk

# presumably required by nltk.word_tokenize on newer NLTK releases — TODO confirm
nltk.download('punkt_tab')




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
# NOTE(review): every resource below was already requested by earlier cells
# (the captured output reports them as up-to-date), and nltk is already
# imported; this cell looks like a leftover and could be folded into one
# setup cell.
import nltk
nltk.download('averaged_perceptron_tagger')  # POS tagger model used by nltk.pos_tag
nltk.download('punkt')  # For tokenization
nltk.download('wordnet')  # For lemmatization
nltk.download('stopwords')  # For stopwords


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
# presumably the tagger resource name that nltk.pos_tag looks up on newer NLTK
# releases — TODO confirm and move next to the other downloads.
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [23]:
# Sanity checks for process(): the comment under each call is the expected
# token list for that input.
print(process("I'm doing well! How about you?"))
# ['im', 'do', 'well', 'how', 'about', 'you']

print(process("Education is the ability to listen to almost anything without losing your temper or your self-confidence."))
# ['education', 'be', 'the', 'ability', 'to', 'listen', 'to', 'almost', 'anything', 'without', 'lose', 'your', 'temper', 'or', 'your', 'self', 'confidence']

print(process("been had done languages cities mice"))
# ['be', 'have', 'do', 'language', 'city', 'mice']

print(process("It's hilarious. Check it out http://t.co/dummyurl"))
# ['it', 'hilarious', 'check', 'it', 'out']

print(process("See it Sunday morning at 8:30a on RTV6 and our RTV6 app. http:…"))
# ['see', 'it', 'sunday', 'morning', 'at', '8', '30a', 'on', 'rtv6', 'and', 'our', 'rtv6', 'app', 'http', '…']
# Here '…' is a special unicode character not in string.punctuation and it is still present in processed text

['im', 'do', 'well', 'how', 'about', 'you']
['education', 'be', 'the', 'ability', 'to', 'listen', 'to', 'almost', 'anything', 'without', 'lose', 'your', 'temper', 'or', 'your', 'self', 'confidence']
['be', 'have', 'do', 'language', 'city', 'mice']
['it', 'hilarious', 'check', 'it', 'out']
['see', 'it', 'sunday', 'morning', 'at', '8', '30a', 'on', 'rtv6', 'and', 'our', 'rtv6', 'app', 'http', '…']


In [33]:
# Load the labeled training tweets. na_filter=False turns off pandas'
# missing-value detection, so cells are kept as read (e.g. an empty tweet stays
# an empty string instead of becoming NaN).
tweets = pd.read_csv("/content/tweets_train.csv", na_filter=False)
display(tweets.head())

Unnamed: 0,screen_name,text
0,GOP,RT @GOPconvention: #Oregon votes today. That m...
1,TheDemocrats,RT @DWStweets: The choice for 2016 is clear: W...
2,HillaryClinton,Trump's calling for trillion dollar tax cuts f...
3,HillaryClinton,.@TimKaine's guiding principle: the belief tha...
4,timkaine,Glad the Senate could pass a #THUD / MilCon / ...


In [34]:
def process_all(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """ process all text in the dataframe using process() function.
    Inputs
        df: pd.DataFrame: dataframe containing a column 'text' loaded from the CSV file
        lemmatizer: an instance of a class implementing the lemmatize() method
                    (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
    Outputs
        pd.DataFrame: new dataframe in which the values of text column have been changed from str to list(str),
                        the output from process() function. Other columns are unaffected.

    The input dataframe is left untouched, so calling this function again on
    the same raw dataframe is safe (mutating it in place would feed token
    lists back into process() on a second run).
    """
    # Forward the caller's lemmatizer instead of silently falling back to
    # process()'s own default.
    tokenized_text = df['text'].apply(lambda text: process(text, lemmatizer=lemmatizer))

    # assign() returns a new frame with 'text' replaced and every other column kept.
    return df.assign(text=tokenized_text)

In [35]:
# Tokenize and lemmatize the text of every training tweet. The commented table
# below is the expected head() of the result.
processed_tweets = process_all(tweets)
print(processed_tweets.head())

#       screen_name                                               text
# 0             GOP  [rt, gopconvention, oregon, vote, today, that,...
# 1    TheDemocrats  [rt, dwstweets, the, choice, for, 2016, be, cl...
# 2  HillaryClinton  [trump, call, for, trillion, dollar, tax, cut,...
# 3  HillaryClinton  [timkaine, guide, principle, the, belief, that...
# 4        timkaine  [glad, the, senate, could, pass, a, thud, milc...

      screen_name                                               text
0             GOP  [rt, gopconvention, oregon, vote, today, that,...
1    TheDemocrats  [rt, dwstweets, the, choice, for, 2016, be, cl...
2  HillaryClinton  [trump, call, for, trillion, dollar, tax, cut,...
3  HillaryClinton  [timkaine, guide, principle, the, belief, that...
4        timkaine  [glad, the, senate, could, pass, a, thud, milc...


## B. Feature Construction

This step is to derive feature vectors from the tokenized tweets. In this section, we are  constructing a bag-of-words TF-IDF feature vector.

In [36]:
def create_features(processed_tweets, stop_words):
    """ builds the TF-IDF feature matrix from tokenized tweets
    Inputs:
        processed_tweets: pd.DataFrame: processed tweets read from train/test csv file,
            containing the column 'text' (one list of tokens per tweet)
        stop_words: list(str): stop_words by nltk stopwords (after processing)
    Outputs:
        sklearn.feature_extraction.text.TfidfVectorizer: the fitted TfidfVectorizer object;
            keep it to transform test tweets in the same way as train tweets
        scipy.sparse.csr.csr_matrix: sparse bag-of-words TF-IDF feature matrix
    """
    # The vectorizer receives the stop words as a plain list (a set may be passed in).
    stop_word_list = list(stop_words)

    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
        analyzer='word',
        min_df=2,  # keep only terms that occur in at least two tweets
        stop_words=stop_word_list,
    )

    # The vectorizer consumes strings, so each token list is re-joined with spaces.
    documents = list(map(" ".join, processed_tweets['text']))
    feature_matrix = vectorizer.fit_transform(documents)
    return vectorizer, feature_matrix


In [37]:

# Run every NLTK stop word through process() so the stop-word list is
# normalized the same way as the tweet tokens, then de-duplicate with set().
processed_stopwords = set(np.concatenate([process(word) for word in stopwords]))
# Fit the TF-IDF vectorizer on the training tweets; X is the training feature matrix.
(tfidf, X) = create_features(processed_tweets, processed_stopwords)
# Ignore warning
tfidf, X


(TfidfVectorizer(min_df=2,
                 stop_words=['other', 'be', 'itself', 'wouldn', 'm', 'an',
                             'hasn', 'but', 'such', 'shouldve', 'youll', 'youd',
                             'yourself', 'any', 'about', 'by', 'between', 'too',
                             'against', 'youre', 'she', 'wasn', 'down', 'ma',
                             'the', 'to', 'when', 'i', 'haven', 'ourselves', ...]),
 <17298x7915 sparse matrix of type '<class 'numpy.float64'>'
 	with 161708 stored elements in Compressed Sparse Row format>)



For each tweet, we assign a class label (0 or 1) based on its `screen_name`: 0 for `realDonaldTrump`, `mike_pence`, and `GOP`, and 1 for the rest.

In [38]:
def create_labels(processed_tweets):
    """ derives binary class labels from the screen_name column
    Inputs:
        processed_tweets: pd.DataFrame: tweets read from train file, containing the column 'screen_name'
    Outputs:
        numpy.ndarray(int): dense binary numpy array of class labels
            (0 for the Republican accounts listed below, 1 for every other account)
    """
    republican_handles = {'realDonaldTrump', 'mike_pence', 'GOP'}

    # Vectorized membership test instead of a per-row lambda.
    is_republican = processed_tweets['screen_name'].isin(republican_handles)

    return np.where(is_republican.to_numpy(), 0, 1).astype(np.int32)

In [39]:
# Training labels, one per row of processed_tweets (0 = Republican account, 1 = otherwise).
y = create_labels(processed_tweets)
y


array([0, 1, 1, ..., 0, 1, 0], dtype=int32)

## Classification
Integrating things together and implementing a model for the classification of tweets.


To determine whether the classifier is performing well, we compare it to a baseline classifier.

In [40]:

class MajorityLabelClassifier():
    """
    A classifier that predicts the mode of training labels

    Baseline model: fit() remembers the most frequent training label and
    predict() returns that label for every row, ignoring the features.
    """
    def __init__(self):
        """
        Start unfitted; most_frequent_class is set by fit().
        """
        self.most_frequent_class = None

    def fit(self, X, y):
        """
        Store the mode of the training labels.

        Inputs:
            X: training features; not used, accepted for API symmetry with other classifiers
            y: array-like of training labels (any dtype np.unique can sort,
               e.g. ints including negatives, or strings)
        Outputs:
            self, so calls can be chained as clf.fit(X, y).predict(X)
        Raises:
            ValueError: if y is empty
        """
        # np.unique (unlike np.bincount) is not limited to non-negative ints.
        # It returns labels sorted, so a tie resolves to the smallest label.
        labels, counts = np.unique(np.asarray(y), return_counts=True)
        if labels.size == 0:
            raise ValueError("cannot fit MajorityLabelClassifier on an empty label array")
        self.most_frequent_class = labels[np.argmax(counts)]
        return self

    def predict(self, X):
        """
        Predict the stored majority label for each data instance in X.

        Inputs:
            X: object with a .shape attribute; only X.shape[0] (number of rows) is used
        Outputs:
            numpy.ndarray of length X.shape[0] filled with the majority label
        Raises:
            RuntimeError: if called before fit()
        """
        if self.most_frequent_class is None:
            raise RuntimeError("MajorityLabelClassifier.predict() called before fit()")
        return np.full(X.shape[0], self.most_frequent_class)

# Baseline to beat: always predict the majority training label.
baselineClf = MajorityLabelClassifier()

baselineClf.fit(X, y)
y_pred = baselineClf.predict(X)
# Accuracy on the same data the baseline was fitted on, i.e. the share of the
# majority class in y.
training_accuracy = sklearn.metrics.accuracy_score(y, y_pred)
print(training_accuracy)


0.5001734304543878


Implementing the `learn_classifier()` function. `kernel` is always one of {`linear`, `poly`, `rbf`, `sigmoid`}.

In [41]:
def learn_classifier(X_train, y_train, kernel):
    """ fits a support vector classifier with the requested kernel
    Inputs:
        X_train: scipy.sparse.csr.csr_matrix: sparse matrix of features, output of create_features()
        y_train: numpy.ndarray(int): dense binary vector of class labels, output of create_labels()
        kernel: str: kernel function to be used with classifier. [linear|poly|rbf|sigmoid]
    Outputs:
        sklearn.svm.SVC: classifier learnt from data
    """
    # All SVC hyper-parameters other than the kernel stay at their defaults.
    model = sklearn.svm.SVC(kernel=kernel)
    # fit() returns the fitted estimator itself.
    return model.fit(X_train, y_train)

In [42]:

# First model: an SVM with a linear kernel trained on the full training matrix.
classifier = learn_classifier(X, y, 'linear')

In [43]:
def evaluate_classifier(classifier, X_validation, y_validation):
    """ scores a fitted classifier on labeled validation data
    Inputs:
        classifier: sklearn.svm.classes.SVC: classifer to evaluate
        X_validation: scipy.sparse.csr.csr_matrix: sparse matrix of features
        y_validation: numpy.ndarray(int): dense binary vector of class labels
    Outputs:
        double: accuracy of classifier on the validation data
    """
    predictions = classifier.predict(X_validation)
    # accuracy_score takes the true labels first, then the predictions.
    return sklearn.metrics.accuracy_score(y_validation, predictions)

In [44]:
# This is training accuracy: the classifier is scored on the same X, y it was
# fitted on, so it is an optimistic estimate of performance on unseen tweets.
accuracy = evaluate_classifier(classifier, X, y)
print(accuracy)
# should give around 0.9545034107989363

0.9512660423170308


## Function: Kernel Selection with Cross-Validation
This function implements 4-fold cross-validation to evaluate the performance of different SVM kernels (Linear, Polynomial, RBF, and Sigmoid) for the binary classification problem. It splits the training data into 75% training and 25% validation in each fold, trains SVM models using Bag-of-Words features, and calculates the average accuracy for each kernel to determine the best-performing classifier.


In [45]:
# 4-fold splitter; shuffling with a fixed random_state keeps the folds
# reproducible across runs.
kf = sklearn.model_selection.KFold(n_splits=4, random_state=1, shuffle=True)
kf

KFold(n_splits=4, random_state=1, shuffle=True)

In [46]:
def best_model_selection(kf, X, y):
    """
    Pick the SVM kernel with the highest mean k-fold validation accuracy.
    Other parameters are left at their defaults.
    Input:
    kf (sklearn.model_selection.KFold): kf object defined above
    X (scipy.sparse.csr.csr_matrix): training data
    y (array(int)): training labels
    Return:
    best_kernel (string): one of 'linear', 'rbf', 'poly', 'sigmoid'; on a tie
        the kernel listed first in that order wins
    """
    candidate_kernels = ('linear', 'rbf', 'poly', 'sigmoid')
    mean_scores = {}

    for kernel in candidate_kernels:
        # One accuracy per fold: train on the fold's training rows and score
        # on its held-out rows.
        fold_scores = [
            evaluate_classifier(
                learn_classifier(X[fit_rows], y[fit_rows], kernel),
                X[held_out_rows],
                y[held_out_rows],
            )
            for fit_rows, held_out_rows in kf.split(X)
        ]
        mean_scores[kernel] = np.mean(fold_scores)

    # max() over the dict keys keeps insertion order for ties.
    return max(mean_scores, key=mean_scores.get)
# Runs the full cross-validation: one SVM fit per kernel per fold.
best_kernel = best_model_selection(kf, X, y)
best_kernel

'poly'

Wrapper function that will use model to classify unlabeled tweets from tweets_test.csv file.

In [47]:
def classify_tweets(tfidf, classifier, unlabeled_tweets):
    """ predicts class labels for raw tweet text
    Inputs:
        tfidf: sklearn.feature_extraction.text.TfidfVectorizer: the TfidfVectorizer object used on training data
        classifier: sklearn.svm.SVC: classifier learned
        unlabeled_tweets: pd.DataFrame: tweets read from tweets_test.csv
    Outputs:
        numpy.ndarray(int): dense binary vector of class labels for unlabeled tweets
    """
    # Apply the same text processing as for training, then re-join the tokens
    # because the vectorizer works on strings.
    documents = []
    for raw_text in unlabeled_tweets['text']:
        documents.append(" ".join(process(raw_text)))

    # transform(), not fit_transform(): reuse the vocabulary learned on the training tweets.
    features = tfidf.transform(documents)
    return classifier.predict(features)

In [49]:
# Retrain on the full training set with the kernel selected by
# cross-validation, then label the tweets from tweets_test.csv.
# NOTE(review): this rebinds `classifier` (previously the linear-kernel model)
# and `y_pred` (previously the baseline predictions).
classifier = learn_classifier(X, y, best_kernel)
unlabeled_tweets = pd.read_csv("/content/tweets_test.csv", na_filter=False)
y_pred = classify_tweets(tfidf, classifier, unlabeled_tweets)
y_pred

array([1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,