# Profiling Irony and Stereotype Spreaders on Twitter
### Language Processing 2
##### Caroline Amalie Ørum-Hansen, Maja Mittag & Trine K. M. S. Engelund
_______________

### **Import functions and libraries**

In [1]:
# import our custom functions
from read_files import *
from feature_tranformers import *

# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, f1_score

### **Import data**

We import the tweets (X) and the true labels (y), and replace the values in y with dummy values.

0 = not ironic, 1 = ironic.

In [2]:
# import data
X, y = get_data()
y = np.where(y == 'NI', 0, 1) # place with dummy values

print(X.shape, y.shape)

(420, 200) (420,)


### **Split data**

We split data in 80% train and 20% test.

In [3]:
# split dataset in train and test
X_train, x_test, Y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print("Train shape: ", X_train.shape, "Test shape: ", x_test.shape)

Train shape:  (336, 200) Test shape:  (84, 200)


### **Features**

**Author level features**

In [4]:
# FeatureUnion of features at author level
author_features = [
  ('tfidf', TfidfVectorizer(analyzer='word', # char n-grams
                            ngram_range=(1,3), # use uni-, bi and trigrams
                            max_df = 0.90, # ignore terms that appear in more than 90% of the documents
                            min_df=0.01,)), # ignore terms that appear in less than 1% of the documents
  ('TTR', TTR()), # type-token ratio / lexical diversity
  ('average_word', avg_word()), # average word length
  ('average_char', avg_char()), # average char length
  ('spongebob', spongebob()) # Mocking Spongebob
]

author_features_combined = FeatureUnion(transformer_list=author_features, n_jobs=-1)
author_features_combined

FeatureUnion(n_jobs=-1,
             transformer_list=[('tfidf',
                                TfidfVectorizer(max_df=0.9, min_df=0.01,
                                                ngram_range=(1, 3))),
                               ('TTR', TTR()), ('average_word', avg_word()),
                               ('average_char', avg_char()),
                               ('spongebob', spongebob())])

In [5]:
# Pipeline for features at author level
author_pipe = Pipeline([
    ('preprocesser_author', preprocess()), # preprocess the tweets
    ('features', author_features_combined) # compute features
])

**Tweet level features**

In [6]:
# FeatureUnion of features at tweet level
tweet_features = [
  ('emoji_sentiment_diff', emoji_sentiment_diff()),
  ('sentiment_incongruity', sentiment_incongruity())
]

tweet_features_combined = FeatureUnion(transformer_list=tweet_features, n_jobs=-1)
tweet_features_combined

FeatureUnion(n_jobs=-1,
             transformer_list=[('emoji_sentiment_diff', emoji_sentiment_diff()),
                               ('sentiment_incongruity',
                                sentiment_incongruity())])

In [7]:
# Pipeline for features at tweet level
tweet_pipe = Pipeline([
    ('preprocesser_tweet', empty2dot()), # preprocess the tweets
    ('features', tweet_features_combined), # compute features
])

**Combine all features**

In [8]:
# combine all features
all_features = FeatureUnion(
    [
    ('authors_features', author_pipe), # features at author level
    ('tweet_features', tweet_pipe), # features at tweet level
    ('stylometric_counts', stylometric_counts()) # stylistic counts (also at author level)
    ],
    n_jobs=-1)
all_features

FeatureUnion(n_jobs=-1,
             transformer_list=[('authors_features',
                                Pipeline(steps=[('preprocesser_author',
                                                 preprocess()),
                                                ('features',
                                                 FeatureUnion(n_jobs=-1,
                                                              transformer_list=[('tfidf',
                                                                                 TfidfVectorizer(max_df=0.9,
                                                                                                 min_df=0.01,
                                                                                                 ngram_range=(1,
                                                                                                              3))),
                                                                                ('TTR',
                            

### **Pipeline**

**Initialize pipeline**

In [9]:
# initiate pipeline 
pipe = Pipeline([
    ('features', all_features), # compute features
    ('scaler', MaxAbsScaler()), # scale features
    ('classifier', SVC()), # run classifier (SVC is just a placeholder)
])

pipe

Pipeline(steps=[('features',
                 FeatureUnion(n_jobs=-1,
                              transformer_list=[('authors_features',
                                                 Pipeline(steps=[('preprocesser_author',
                                                                  preprocess()),
                                                                 ('features',
                                                                  FeatureUnion(n_jobs=-1,
                                                                               transformer_list=[('tfidf',
                                                                                                  TfidfVectorizer(max_df=0.9,
                                                                                                                  min_df=0.01,
                                                                                                                  ngram_range=(1,
                                 

### **Gridsearch**

**Parameter grid**

In [10]:
# parameter grid for classifiers
param_grid = [
    # SVM
    {
        'classifier': [SVC()],
        'classifier__kernel': ['linear'],
        'classifier__gamma': [0.1, 5],
        'classifier__C': [0.001, 1000]
    },

    # Random Forrest
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [200, 300],
        'classifier__n_jobs': [-1]
    },

    # Logistic Regression
    {
        'classifier': [LogisticRegression()],
        'classifier__solver': ['liblinear'],
        'classifier__C': [0.01, 1, 100]
    }
]

**5-fold gridsearch**

In [11]:
# gridsearch
grid_search = GridSearchCV( pipe,
                            param_grid=param_grid,
                            cv=5,
                            scoring='accuracy', 
                            refit='accuracy', 
                            n_jobs=-1, 
                            return_train_score=True)
grid_search.fit(X_train, Y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('features',
                                        FeatureUnion(n_jobs=-1,
                                                     transformer_list=[('authors_features',
                                                                        Pipeline(steps=[('preprocesser_author',
                                                                                         preprocess()),
                                                                                        ('features',
                                                                                         FeatureUnion(n_jobs=-1,
                                                                                                      transformer_list=[('tfidf',
                                                                                                                         TfidfVectorizer(max_df=0.9,
                                                                

### **Results**

**Best parameters**

In [12]:
# print best parameters
grid_search.best_estimator_

Pipeline(steps=[('features',
                 FeatureUnion(n_jobs=-1,
                              transformer_list=[('authors_features',
                                                 Pipeline(steps=[('preprocesser_author',
                                                                  preprocess()),
                                                                 ('features',
                                                                  FeatureUnion(n_jobs=-1,
                                                                               transformer_list=[('tfidf',
                                                                                                  TfidfVectorizer(max_df=0.9,
                                                                                                                  min_df=0.01,
                                                                                                                  ngram_range=(1,
                                 

**Mean cross-validation accuracy of best model**

In [13]:
grid_search.best_score_

0.8808604038630378

**Train and test accuracy of best model**

In [14]:
print("Train accuracy:", grid_search.score(X_train,Y_train))
print("Test accuracy:", grid_search.score(x_test,y_test))

Train accuracy: 1.0
Test accuracy: 0.8928571428571429


**Results from each fold**

In [16]:
# save gridsearch results in dataframe
scoring_results = pd.DataFrame(grid_search.cv_results_)
print(scoring_results.columns)
scoring_results

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_classifier', 'param_classifier__C', 'param_classifier__gamma',
       'param_classifier__kernel', 'param_classifier__n_estimators',
       'param_classifier__n_jobs', 'param_classifier__solver', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score', 'split0_train_score',
       'split1_train_score', 'split2_train_score', 'split3_train_score',
       'split4_train_score', 'mean_train_score', 'std_train_score'],
      dtype='object')


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_classifier__gamma,param_classifier__kernel,param_classifier__n_estimators,param_classifier__n_jobs,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,236.719534,2.906942,52.474745,1.731977,SVC(),0.001,0.1,linear,,,...,0.868964,0.024232,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,230.718771,13.415928,54.650036,6.829754,SVC(),0.001,5.0,linear,,,...,0.868964,0.024232,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,220.953818,4.139174,54.228982,2.005914,SVC(),1000.0,0.1,linear,,,...,0.877875,0.029243,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,215.750901,21.997525,51.319964,1.881687,SVC(),1000.0,5.0,linear,,,...,0.877875,0.029243,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,221.354735,10.698988,53.100247,4.504647,"RandomForestClassifier(n_estimators=300, n_job...",,,,200.0,-1.0,...,0.863038,0.034743,7,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,218.500857,10.791893,51.896949,2.008467,"RandomForestClassifier(n_estimators=300, n_job...",,,,300.0,-1.0,...,0.88086,0.039126,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,211.294241,15.759943,52.588498,3.567639,LogisticRegression(),0.01,,,,,...,0.857112,0.026171,9,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,218.928298,11.383826,53.679711,2.642386,LogisticRegression(),1.0,,,,,...,0.860053,0.027955,8,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,200.300083,5.940365,43.07708,2.829122,LogisticRegression(),100.0,,,,,...,0.866023,0.031483,6,1.0,1.0,1.0,1.0,1.0,1.0,0.0
