In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns
import gensim


In [None]:
def merging_labels_and_sentences(textfile, labelfile):
    """Load a tweet file and its label file and join them row by row.

    Both paths are given WITHOUT the '.txt' extension; it is appended here.
    Each file is read as a header-less, tab-separated table with quoting
    disabled. Rows are numbered from 1 and the two tables are joined on that
    number.

    Returns a DataFrame with the columns 'id', 'tweet' and 'label'.
    """
    # Options shared by both reads; quoting=3 is csv.QUOTE_NONE.
    read_options = dict(header=None, skiprows=0, sep='\t', quoting=3)
    tweets = pd.read_csv(textfile + '.txt', names=['tweet'], **read_options)
    labels = pd.read_csv(labelfile + '.txt', names=['label'], **read_options)

    # The same 1-based ids are attached to both tables, so the join below
    # pairs the n-th tweet with the n-th label.
    row_ids = list(range(1, len(tweets) + 1))
    labels.insert(loc=0, column='id', value=row_ids)
    tweets.insert(loc=0, column='id', value=row_ids)

    return tweets.merge(labels, on='id', how='left')

class MeanEmbeddingVectorizer(object):
    """Turn tokenized texts into fixed-size vectors by averaging word vectors.

    Parameters
    ----------
    word2vec : mapping of token -> vector
        Must support ``in``, item lookup and ``.values()``; all vectors are
        expected to share one length, which is taken from the first value.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        ``y`` is optional so the object can be fitted without labels, as
        scikit-learn transformers usually are.
        """
        return self

    def transform(self, X):
        """Return one averaged vector per token list in ``X``.

        Tokens missing from ``word2vec`` are skipped. A text with no known
        token (including an empty text) maps to a zero vector of length
        ``self.dim``.
        """
        zero_vector = [np.zeros(self.dim)]
        averaged = []
        for words in X:
            known_vectors = [self.word2vec[w] for w in words if w in self.word2vec]
            averaged.append(np.mean(known_vectors or zero_vector, axis=0))
        return np.array(averaged)

def tokenize_train_test(X_train, X_test):
    """Tokenize the training and test texts with ``nltk.word_tokenize``.

    Word2Vec runs on tokenized sentences, so each text is converted into a
    list of tokens.

    Returns
    -------
    (X_train_tok, X_test_tok) : tuple of two lists of token lists, in the
        same order as the inputs.
    """
    X_train_tok = [nltk.word_tokenize(i) for i in X_train]
    X_test_tok = [nltk.word_tokenize(i) for i in X_test]
    # Previously both lists were built and then dropped (the function
    # returned None); hand them back to the caller.
    return X_train_tok, X_test_tok

        

In [None]:
data_test = merging_labels_and_sentences('datasets/offensive/test_text','datasets/offensive/test_labels')

data = merging_labels_and_sentences('datasets/offensive/train_text','datasets/offensive/train_labels')

data_val = merging_labels_and_sentences('datasets/offensive/val_text','datasets/offensive/val_labels')

In [None]:
pipeline = Pipeline([
    ('normalizer', StandardScaler()), #Step1 - normalize data
    ('clf', LogisticRegression()) #step2 - classifier
])
pipeline.steps

In [None]:
# #X_train, X_test, y_train, y_test = train_test_split(data['tweet'].values,
#                                                    data['label'],
#                                                    test_size = 0.4,
#                                                    random_state = 10)

In [None]:
# # # print(X_train.shape)
# # # print(X_test.shape)
# # # print(y_train.shape)
# # # print(y_test.shape)

In [None]:
# NOTE(review): X_train, X_test, y_train and y_test are not defined by any
# active cell above this one (the train_test_split cell is commented out), so
# on a fresh kernel these prints raise NameError.
print(X_train[0:2]) # tweets
print(X_test[0:2])  # tweets
print(y_train[0:2]) # labels
print(y_test[0:2]) # labels

In [None]:
# # (#W2v)
# X_train_tok=[nltk.word_tokenize(i) for i in X_train]
# X_test_tok=[nltk.word_tokenize(i) for i in X_test]
# model = gensim.models.Word2Vec(X_train_tok,min_count=1)
# w2v = dict(zip(model.wv.index_to_key, model.wv))      
# modelw = MeanEmbeddingVectorizer(w2v)

# # converting text to numerical data using Word2Vec
# X_train = modelw.transform(X_train_tok)
# X_test = modelw.transform(X_test_tok)

In [None]:
# from sklearn.model_selection import cross_validate

# scores = cross_validate(pipeline, X_train, y_train)
# scores

In [None]:
# cross_validate is imported here because its only other import in this
# notebook sits in a commented-out cell.
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Candidate classifiers, each evaluated once with its default settings
# (SVC used to be listed twice, which only repeated the same evaluation).
clfs = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
]

# Swap each candidate into the 'clf' step of the pipeline and report the
# mean and standard deviation of every cross-validation metric.
for classifier in clfs:
    pipeline.set_params(clf = classifier)
    scores = cross_validate(pipeline, X_train, y_train)
    print('---------------------------------')
    print(str(classifier))
    print('-----------------------------------')
    for key, values in scores.items():
        print(key,' mean ', values.mean())
        print(key,' std ', values.std())

In [None]:
from sklearn.model_selection import GridSearchCV
pipeline.set_params(clf= SVC())
pipeline.steps


In [None]:


# cv_grid = GridSearchCV(pipeline, param_grid = {
#     'clf__kernel' : ['linear', 'rbf'],
#     'clf__C' : np.linspace(0.1,1.2,12)
# })

# cv_grid.fit(X_train, y_train)



In [None]:
cv_grid.best_estimator_

In [None]:
cv_grid.best_score_

In [None]:
# NOTE(review): cv_grid is only created by the grid-search cell above, which is
# currently commented out; that cell has to be re-enabled and run before this
# one can work on a fresh kernel.
y_predict = cv_grid.predict(X_test)
accuracy = accuracy_score(y_test,y_predict)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

In [None]:
import re
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

#creating a function to encapsulate preprocessing, to mkae it easy to replicate on  submission data
def processing(df):
    """Add a cleaned text column and simple numeric features to a tweet frame.

    Parameters
    ----------
    df : DataFrame with a string column 'tweet'.

    Returns
    -------
    A NEW DataFrame (the input is left untouched) with these extra columns:
    'processed' (lower-cased tweet without punctuation), 'length' (characters
    in 'processed'), 'words', 'words_not_stopword', 'avg_word_length'
    (over non-stopword tokens, 0 when there are none) and 'commas' (commas in
    the raw tweet).

    Relies on the module-level ``stopWords`` set.
    """
    # Work on a copy so that calling this on a frame (e.g. the raw training
    # data) does not silently add columns to the caller's object.
    df = df.copy()

    #lowering and removing punctuation
    df['processed'] = df['tweet'].apply(lambda x: re.sub(r'[^\w\s]','', x.lower()))

    # Split on any run of whitespace. Removing punctuation often leaves
    # double spaces, and split(' ') would count the empty strings between
    # them as words (and as zero-length non-stopwords).
    tokens = df['processed'].apply(lambda x: x.split())
    content_tokens = tokens.apply(lambda ts: [t for t in ts if t not in stopWords])

    #numerical feature engineering
    #total length of sentence
    df['length'] = df['processed'].apply(len)
    #get number of words
    df['words'] = tokens.apply(len)
    df['words_not_stopword'] = content_tokens.apply(len)
    #get the average word length (non-stopword tokens only)
    df['avg_word_length'] = content_tokens.apply(
        lambda ts: np.mean([len(t) for t in ts]) if ts else 0)
    #get the number of commas in the raw tweet
    df['commas'] = df['tweet'].apply(lambda x: x.count(','))

    return df

data_processed = processing(data)
data_processed.head()

In [None]:


from sklearn.model_selection import train_test_split

features= [c for c in data_processed.columns.values if c  not in ['id','tweet','label']]
numeric_features= [c for c in data_processed.columns.values if c  not in ['id','tweet','label','processed']]
target = 'label'

X_train, X_test, y_train, y_test = train_test_split(data_processed[features], data_processed[target], test_size=0.33, random_state=42)
X_train.head()



In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks one column out of the input by key.

    The selection is ``X[key]``; for a pandas DataFrame that is the column as
    a Series. Intended for text columns that are passed on to a text
    vectorizer.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    def transform(self, X):
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks one column out of the input by key.

    The selection is ``X[[key]]``; for a pandas DataFrame that is a
    one-column frame rather than a Series. Intended for numeric columns that
    are passed on to a scaler.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    def transform(self, X):
        wanted = [self.key]
        return X[wanted]
    

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

text = Pipeline([
                ('selector', TextSelector(key='processed')),
                ('tfidf', TfidfVectorizer( stop_words='english'))
            ])

text.fit_transform(X_train)

In [None]:
from sklearn.preprocessing import StandardScaler

length =  Pipeline([
                ('selector', NumberSelector(key='length')),
                ('standard', StandardScaler())
            ])

x = length.fit_transform(X_train)

In [None]:
words =  Pipeline([
                ('selector', NumberSelector(key='words')),
                ('standard', StandardScaler())
            ])
words_not_stopword =  Pipeline([
                ('selector', NumberSelector(key='words_not_stopword')),
                ('standard', StandardScaler())
            ])
avg_word_length =  Pipeline([
                ('selector', NumberSelector(key='avg_word_length')),
                ('standard', StandardScaler())
            ])
commas =  Pipeline([
                ('selector', NumberSelector(key='commas')),
                ('standard', StandardScaler()),
            ])

In [None]:

from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([('text', text), 
                      ('length', length),
                      ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas)])

feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

In [None]:


from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state = 42)),
])

pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
np.mean(preds == y_test)

In [None]:

from sklearn.model_selection import GridSearchCV

hyperparameters = { 'features__text__tfidf__max_df': [0.9, 0.95],
                    'features__text__tfidf__ngram_range': [(1,1), (1,2)],
                   'classifier__max_depth': [50, 70],
                    'classifier__min_samples_leaf': [1,2]
                  }
clf = GridSearchCV(pipeline, hyperparameters, cv=3)
 
# Fit and tune model
clf.fit(X_train, y_train)


In [None]:
clf.best_params_

In [None]:

# No explicit refit step is needed: GridSearchCV was created with its default
# refit setting, so clf.fit() already refitted the best parameter combination
# on the whole training split. (The bare `clf.refit` expression that used to
# stand here only read an attribute and did nothing.)
preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)

np.mean(preds == y_test)

In [None]:

data_test.to_csv('test_labels.csv')

In [None]:

submission =  pd.read_csv('test_labels.csv')
#preprocessing
submission = processing(submission)
predictions = clf.predict_proba(submission)

preds = pd.DataFrame(data=predictions, columns = clf.best_estimator_.named_steps['classifier'].classes_)

#generating a submission file
result = pd.concat([submission[['id']], preds], axis=1)
result.set_index('id', inplace = True)
result.head()
result.to_csv('final_prediction_test.csv')

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Put this when it's called
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Ignore warning prints
import warnings



import pandas

## Reading in the data

- data: the train data set
- data_test: the test data set

## Functions for the classification pipeline

In [None]:
# Create table for missing data analysis
def draw_missing_data_table(df):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns:
    'Total' (number of null entries) and 'Percent' (null entries divided by
    the number of rows, i.e. a fraction between 0 and 1). Each of the two
    series is sorted in descending order before they are combined.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / null_mask.count()).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# Plot learning curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and validation scores against the training-set size.

    Opens a new figure, runs ``learning_curve`` for ``estimator`` on
    ``(X, y)`` and draws, for both the training and the validation scores,
    the mean across folds as a line and +/- one standard deviation as a
    shaded band.

    ``ylim`` is an optional ``(low, high)`` pair for the y axis; ``cv``,
    ``n_jobs`` and ``train_sizes`` are forwarded to ``learning_curve``.

    Returns the ``matplotlib.pyplot`` module.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    plt.grid()

    # (mean, std, colour, legend label) per curve; axis=1 averages over folds.
    curves = [
        (np.mean(fold_scores, axis=1), np.std(fold_scores, axis=1), colour, label)
        for fold_scores, colour, label in (
            (train_scores, "r", "Training score"),
            (test_scores, "g", "Validation score"),
        )
    ]

    # Bands first, then lines, so the lines are drawn on top.
    for mean, std, colour, _ in curves:
        plt.fill_between(train_sizes, mean - std, mean + std,
                         alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(train_sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

# Plot validation curve
def plot_validation_curve(estimator, title, X, y, param_name, param_range, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and validation scores against one hyper-parameter.

    Runs ``validation_curve`` for ``estimator`` on ``(X, y)`` over
    ``param_range`` values of ``param_name`` and draws, on the current axes,
    the mean score across folds as a line and +/- one standard deviation as
    a shaded band. The x axis is logarithmic.

    ``ylim`` is an optional ``(low, high)`` pair for the y axis. ``cv`` and
    ``n_jobs`` are forwarded to ``validation_curve``. ``train_sizes`` is
    accepted for signature compatibility only and is not used.

    Returns None. No new figure is created, so the function can draw into a
    subplot selected by the caller.
    """
    # param_name / param_range / cv must be passed by keyword: passed
    # positionally, the sixth argument lands in `groups` rather than `cv` on
    # older scikit-learn, and newer releases reject positional use outright.
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, n_jobs=n_jobs)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, color='r', marker='o', markersize=5, label='Training score')
    plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha=0.15, color='r')
    plt.plot(param_range, test_mean, color='g', linestyle='--', marker='s', markersize=5, label='Validation score')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha=0.15, color='g')
    plt.grid()
    plt.xscale('log')
    plt.legend(loc='best')
    # The title argument used to be ignored; callers can still override it
    # with their own plt.title() afterwards.
    plt.title(title)
    plt.xlabel('Parameter')
    plt.ylabel('Score')
    if ylim is not None:
        plt.ylim(*ylim)


In [None]:
# Import data
df = data
df_raw = df.copy()  # Save original data set, just in case.
df_raw_val = data_val.copy()

In [None]:
df.head()

In [None]:
data_val.head()

In [None]:
df.info()

In [None]:
# Descriptive statistics
df.describe().to_csv('descriptive_statistics_trainingset.csv')



### There are three aspects that usually catch my attention when I analyse descriptive statistics:

- **Min and max values.** This can give us an idea about the range of values and is helpful to detect outliers. In our case, all the min and max values seem reasonable and in a reasonable range of values. The only exception could eventually be the max value of 'Fare', but for now we will leave it as it is.
- **Mean and standard deviation.** The mean shows us the central tendency of the distribution, while the standard deviation quantifies its amount of variation. For example, a low standard deviation suggests that data points tend to be close to the mean. Giving a quick look to our values, there's nothing that looks like obviously wrong.
- **Count.** This is important to give us a first perception about the volume of missing data. Here, we can see that some 'Age' data is missing.


In [None]:
# Analyse missing data
draw_missing_data_table(df)

In [None]:


# Data types
df.dtypes



In [None]:
# df data consists of data from the training set only, which are then split into 'train' and 'test'

X = df['tweet']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)


In [None]:
# Debug
print('Inputs: \n', X_train.head())
print('Outputs: \n', y_train.head())

In [None]:
# (tf-Idf)
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
# Fit logistic regression
logreg = LogisticRegression(max_iter = 1000, solver='lbfgs')
logreg.fit(X_train_vectors_tfidf, y_train)

In [None]:
# Model performance
scores = cross_val_score(logreg, X_train_vectors_tfidf, y_train, cv=3)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))


In [None]:
# Assessing model performance
# Plot learning curves
title = "Learning Curves (Logistic Regression)"
cv = 3
plot_learning_curve(logreg, title, X_train_vectors_tfidf, y_train, ylim=(0.63, 0.95), cv=cv, n_jobs=3);

***

**Discussion of results**

- We see a large gap between the training score and the validation score.

- _**Thus, the learning curve of the training score and the validation score shows an overfitting in the model.**_ 

- A solution for the overfitting would be:
    1. Reduce the complexity of the model and/or
    2. Collect more data

- The final score is 0.749, which means our model makes better predictions than a flip-a-coin strategy, but is still far from being a useful model.

***



**Learning curves in a nutshell:**

- Learning curves allow us to _diagnose_ whether the model is _**overfitting**_ or _**underfitting**_.

- When the _**model overfits**_, it means that it _**performs well on the training set, but not on the validation set**_. Accordingly, the model is not able to generalize to unseen data. If the model is overfitting, the learning curve will present a gap between the training and validation scores. Two common solutions for overfitting are reducing the complexity of the model and/or collecting more data.

- On the other hand, _**underfitting means that the model is not able to perform well on either the training or the validation set**_. In those cases, the _**learning curves will converge to a low score value**_. When the model underfits, _gathering more data_ is _**not helpful**_ because the _**model**_ is already _**unable**_ to _**learn the training data**_.

---------------------------------------------------------------------
<center><b>Therefore, the best approaches for these cases are to improve the model (e.g., tuning the hyperparameters) or to improve the quality of the data (e.g., collecting a different set of features).</b></center>

***


In [None]:
# Plot validation curve
title = 'Validation Curve (Logistic Regression)'
param_name = 'C' 
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0] 
cv = 3
test = plot_validation_curve(estimator=logreg, title=title, X=X_train_vectors_tfidf, y=y_train, param_name=param_name,
                      ylim=(0.5, 1.01), param_range=param_range);


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

C_param_range = [0.001,0.01,0.1,1,10,100]

# One row per C value; the hold-out accuracy is filled in by the loop below.
df_test = pd.DataFrame({'C_parameter': C_param_range, 'Accuracy': np.nan})

plt.figure(figsize=(10, 10))

for j, i in enumerate(C_param_range, start=1):

    # Apply logistic regression model to training data
    lr = LogisticRegression(penalty = 'l2', C = i,random_state = 0)
    lr.fit(X_train_vectors_tfidf,y_train)

    # Predict using model
    y_pred_sepal = lr.predict(X_test_vectors_tfidf)

    # Saving accuracy score in table (assigning the score to df_test itself
    # used to replace the whole table with a single float)
    df_test.loc[j - 1, 'Accuracy'] = accuracy_score(y_test,y_pred_sepal)

    # One subplot per C value.
    # NOTE(review): the curve drawn here is computed from `logreg` over
    # `param_range` and does not depend on the loop value `i`, so all six
    # panels show the same curve under different titles - confirm intent.
    plt.subplot(3,2,j)
    plt.subplots_adjust(hspace = 0.4)
    plot_validation_curve(estimator=logreg, title=title, X=X_train_vectors_tfidf, y=y_train, param_name=param_name,
                      ylim=(0.5, 1.01), param_range=param_range);
    plt.title('C = %s'%i)

df_test



**Validation curves in a nutshell:**

Validation curves are a tool that we can use to improve the performance of our model. It counts as a way of tuning our hyperparameters.
They are different from the learning curves. Here, the goal is to see how the model parameter impacts the training and validation scores. This allows us to choose a different value for the parameter, to improve the model.
Once again, if there is a gap between the training and the validation score, the model is probably overfitting. In contrast, if there is no gap but the score value is low, we can say that the model underfits.

**Discussion of our results:**

_The figure shows that there is a huge difference in model's performance. Note that in a logistic regression, C is the only model parameter that we can change (see scikit-learn documentation)._ 

## Splitting into: 
#### Training and Test from the two provided text files for training data set and test data set

_The previous model-performance results were computed from the training data set only, which was split into a training part and a testing part. We want to explore whether the results change when the same functions are applied to the provided training and test data sets instead._
***

In [None]:
X_train_ = data['tweet']
X_test_ = data_test['tweet']
y_train_ = data['label']
y_test_ = data_test['label']

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer #for word embedding

# (tf-Idf)
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf_ = tfidf_vectorizer.fit_transform(X_train_) 
X_test_vectors_tfidf_ = tfidf_vectorizer.transform(X_test_)


In [None]:
# Debug
print('Inputs: \n', X_train_.head())
print('Outputs: \n', y_train_.head())

In [None]:
# Fit logistic regression
logreg_ = LogisticRegression(max_iter = 1000, solver='lbfgs')
logreg_.fit(X_train_vectors_tfidf_, y_train_)

In [None]:
# Model performance
scores_train = cross_val_score(logreg_, X_train_vectors_tfidf_, y_train_, cv=3)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores_train), np.std(scores_train)))


In [None]:
# Assessing model performance
# Plot learning curves
title = "Learning Curves (Logistic Regression)"
cv = 3
plot_learning_curve(logreg_, title, X_train_vectors_tfidf_, y_train_, ylim=(0.6, 0.85), cv=cv, n_jobs=3);

In [None]:
from matplotlib.colors import ListedColormap

def plot_decision_regions(X,y,classifier, test_idx = None,resolution=0.02):
    """Scatter-plot the first two columns of ``X``, one marker/colour per class.

    Parameters
    ----------
    X : 2-D array indexable as ``X[mask, 0]`` / ``X[mask, 1]``.
    y : class labels aligned with the rows of ``X``.
    classifier, resolution : currently unused. The decision-surface drawing
        they belonged to is disabled, so only the samples are plotted.
    test_idx : optional index selecting test samples; when given, those
        samples and their labels are printed.

    Draws on the current matplotlib axes and returns None.
    """
    # Initialise the marker types and colors
    markers = ('s','x','o','^','v')
    colors = ('red','blue','lightgreen','gray','cyan')
    classes = np.unique(y)
    # we take the color mapping corresponding to the amount of classes in
    # the target data
    color_Map = ListedColormap(colors[:len(classes)])

    # The decision surface is not drawn, so no prediction grid is built here
    # (the unused meshgrid could become very large for a fine resolution).

    # Only slice out the test samples when an index was actually supplied;
    # indexing with the default None reshapes arrays and fails for a Series.
    if test_idx is not None:
        X_test, Y_test = X[test_idx,:], y[test_idx]
        print(X_test, Y_test)

    # Plot samples
    for idx, cl in enumerate(classes):
        plt.scatter(x = X[y == cl, 0], y = X[y == cl, 1],
                    alpha = 0.8, c = color_Map(idx),
                    # wrap around instead of raising for more than 5 classes
                    marker = markers[idx % len(markers)], label = cl
                   )

In [None]:
print("# training samples : ", len(X_train_))
# Count the test tweets; this line used to report the number of training labels.
print("# testing samples : ", len(X_test_))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

C_param_range = [0.001,0.01,0.1,1,10,100]

# One row per C value; the test-set accuracy is filled in by the loop below.
df_test = pd.DataFrame({'C_parameter': C_param_range, 'Accuracy': np.nan})
plt.figure(figsize=(10, 10))

for j, i in enumerate(C_param_range, start=1):

    # Apply logistic regression model to training data
    lr = LogisticRegression(penalty = 'l2', C = i,random_state = 0)
    lr.fit(X_train_vectors_tfidf_,y_train_)

    # Predict using model
    y_pred_sepal_ = lr.predict(X_test_vectors_tfidf_)

    # Saving accuracy score in table (assigning the score to df_test itself
    # used to replace the whole table with a single float)
    df_test.loc[j - 1, 'Accuracy'] = accuracy_score(y_test_, y_pred_sepal_)

    # One subplot per C value.
    # NOTE(review): the curve drawn here is computed from `logreg_` over
    # `param_range` and does not depend on the loop value `i`, so all six
    # panels show the same curve under different titles - confirm intent.
    plt.subplot(3,2,j)
    plt.subplots_adjust(hspace = 0.4)

    plot_validation_curve(estimator=logreg_, title=title, X=X_train_vectors_tfidf_, y=y_train_, param_name=param_name,
                      ylim=(0.5, 1.01), param_range=param_range);
    plt.title('C = %s'%i)

df_test

In [None]:
# Plot validation curve
title = 'Validation Curve (Logistic Regression)'
param_name = 'C'
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0] 
cv = 3
plot_validation_curve(estimator=logreg_, title=title, X=X_train_vectors_tfidf_, y=y_train_, param_name=param_name,
                      ylim=(0.5, 1.01), param_range=param_range);






**Validation curves in a nutshell:**

Validation curves are a tool that we can use to improve the performance of our model. It counts as a way of tuning our hyperparameters.
They are different from the learning curves. Here, the goal is to see how the model parameter impacts the training and validation scores. This allows us to choose a different value for the parameter, to improve the model.
Once again, if there is a gap between the training and the validation score, the model is probably overfitting. In contrast, if there is no gap but the score value is low, we can say that the model underfits.

**Discussion of our results:**

_The figure shows that there is no huge difference in the model's performance as long as we choose a C value of $10^{-1}$ or higher. Note that in a logistic regression, C is the only model parameter that we can change (see scikit-learn documentation)._ 

## **Summary**

There is no major difference from: 
- splitting the training data set into a training and a testing part or,
- splitting into a training set from the provided training data and into a testing set from the provided testing data.

***
***

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import validation_curve

#warnings.warn(CV_Warning, category=FutureWarning)


### 1. Use of validation curves for both datasets.
C_param_range = [0.001,0.01,0.1,1,10,100,1000]

plt.figure(figsize=(15, 10))

# Apply logistic regression model to training data.
# C is deliberately not set here: validation_curve overrides it with every
# value in C_param_range, and the former `C = i` depended on a loop variable
# left over from an earlier cell.
lr = LogisticRegression(penalty='l2', random_state = 0)

# Plot validation curve
train_scores, test_scores = validation_curve(estimator=lr
                                             ,X=X_train_vectors_tfidf_
                                             ,y=y_train_
                                             ,param_name='C'
                                             ,param_range=C_param_range
                                             )

# Mean and spread across the cross-validation folds for each C value.
train_mean = np.mean(train_scores,axis=1)
train_std = np.std(train_scores,axis=1)
test_mean = np.mean(test_scores,axis=1)
test_std = np.std(test_scores,axis=1)

plt.subplot(2,2,1)
plt.plot(C_param_range
            ,train_mean
            ,color='blue'
            ,marker='o'
            ,markersize=5
            ,label='training accuracy')

plt.plot(C_param_range
            ,test_mean
            ,color='green'
            ,marker='x'
            ,markersize=5
            ,label='test accuracy')

# The C values span six orders of magnitude; a log axis keeps them evenly
# spaced, as in plot_validation_curve.
plt.xscale('log')
plt.xlabel('C_parameter')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.5,1.5])






## 

In [None]:
# Select features
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest



In [None]:
# os / sys / warnings are imported here so that this cell does not depend on
# imports made later in the notebook (sys used to be imported only after its
# first use, and os not at all).
import os
import sys
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
# Disables ConvergenceWarning
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

## Get score using original model
logreg_ = LogisticRegression(C=1, solver='lbfgs')
logreg_.fit(X_train_vectors_tfidf_, y_train_)
scores = cross_val_score(logreg_, X_train_vectors_tfidf_, y_train_, cv=3)
with open('CV_accuracy_orginal.txt', 'w') as o:
    print('CV accuracy (original): %.3f +/- %.3f' % (np.mean(scores),
    np.std(scores)), file=o)

# Baseline to beat: the model that uses every tf-idf feature. Initialising
# all three values here guarantees they exist even if no k in the search
# below improves on the baseline.
highest_score = np.mean(scores)
std = np.std(scores)
k_features_highest_score = X_train_vectors_tfidf_.shape[1]

## Get score using models with feature selection
# The log file is opened once for the whole search; reopening it in 'w' mode
# inside the loop kept only the line of the last k.
with open('CV_accuracy.txt', 'w') as f:
    for i in range(1, 50, 1):
        # Select i features
        select = SelectKBest(score_func=chi2, k=i)
        select.fit(X_train_vectors_tfidf_, y_train_)

        X_train_poly_selected = select.transform(X_train_vectors_tfidf_)

        # Model with i features selected
        logreg_.fit(X_train_poly_selected, y_train_)
        scores = cross_val_score(logreg_, X_train_poly_selected, y_train_,
        cv=3)
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        print('CV accuracy (number of features = %i): %.3f +/- %.3f' %
        (i, mean_score, std_score), file=f)

        # Save results if best score; ties are broken by the smaller spread.
        if mean_score > highest_score or (mean_score == highest_score and std_score < std):
            highest_score = mean_score
            std = std_score
            k_features_highest_score = i


print('Number of features when highest score: %i' % k_features_highest_score)


# FIT MODEL FOR BEST FEATURE COMBINATION

_The Highly Accurate Model_

In [None]:
# Select features
select = SelectKBest(score_func=chi2, k=k_features_highest_score)
select.fit(X_train_vectors_tfidf_, y_train_)
# Rebuild the training matrix from this selector: the X_train_poly_selected
# left over from the search loop holds the last k that was tried, not the
# best one.
X_train_poly_selected = select.transform(X_train_vectors_tfidf_)

# Fit model
logreg = LogisticRegression(C=1) #C = 1 which is on the edge of overfitting the model
logreg.fit(X_train_poly_selected, y_train_)

# Model performance (scored on the model fitted in this cell)
scores = cross_val_score(logreg, X_train_poly_selected, y_train_, cv=3)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
# Plot learning curves
title = "Learning Curves (Logistic Regression)"
cv = 3
plot_learning_curve(logreg_, title, X_train_poly_selected, 
                    y_train_, ylim=(0.65, 0.8), cv=cv, n_jobs=3);



### **Results**
***

The new model shows no signs of overfitting or underfitting between the training score and validation score

In [None]:
# Plot validation curve

# Imported here so the cell runs on its own. The former import of
# ignore_warnings from sklearn.utils.testing was dropped: the name was never
# used and that module is not importable in recent scikit-learn releases.
import os
import sys
import warnings

# Disables ConvergenceWarning
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses


title = 'Validation Curve (Logistic Regression)'
param_name = 'C'
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0] 
cv = 3
plot_validation_curve(estimator=logreg_, title=title, X=X_train_poly_selected, y=y_train_, 
                      param_name=param_name, ylim=(0.6, 0.85), param_range=param_range);



### **Results**

***
The C parameter is set to 1, which is on the edge of overfitting. There are also no signs of underfitting, as the model performs well (no gaps between the two scores).

## Submit Predictions 

In [None]:
# Get test data set
df_test = data_test

# copy of data_test
df_test_raw = df_test.copy()

In [None]:
# Make predictions
# NOTE(review): the predictions below are made on X_train_poly_selected (the
# training matrix) and paired with the training tweets X_train_; df_test from
# the previous cell is not used here. Confirm whether the provided test tweets
# were meant to be scored instead.
predictions = logreg.predict(X_train_poly_selected)
predictions


# Generate submission file
submission = pd.DataFrame({ 'tweet': X_train_,
                            'predicted': predictions})

#submission.to_csv("submission.csv", index=False)

In [None]:
print('Number of tweets predicted as offensive:', sum(submission['predicted'] == 1))
print('Number of tweets predicted as non-offensive:', sum(submission['predicted'] == 0))


## Conclusion

The aim of this section was to improve data quality through feature extraction techniques and exploratory data analysis. We therefore explored different techniques to improve our model performance and data.


In [None]:
data_processed = processing(data)
data_test_processed = processing(data_test)
data_processed.head()
data_test_processed.head()

In [None]:

from sklearn.model_selection import train_test_split

features= [c for c in data_processed.columns.values if c  not in ['id','tweet','label']]
numeric_features= [c for c in data_processed.columns.values if c  not in ['id','tweet','label','processed']]
target = 'label'

features_test= [c for c in data_test_processed.columns.values if c  not in ['id','tweet','label']]
numeric_features= [c for c in data_test_processed.columns.values if c  not in ['id','tweet','label','processed']]
target_test = 'label'



X_train_pro = data_processed[features] 
X_test_pro = data_test_processed[features_test]
y_train_pro = data_processed[target]
y_test_pro = data_test_processed[target_test]


X_train_pro.head()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Text branch: TextSelector(key='processed') feeds a TF-IDF vectorizer that
# uses scikit-learn's built-in English stop-word list.
text_steps = [
    ('selector', TextSelector(key='processed')),
    ('tfidf', TfidfVectorizer(stop_words='english')),
]
text = Pipeline(steps=text_steps)

# Fit on the training frame; the transformed result is the cell's output.
text.fit_transform(X_train_pro)

In [None]:
from sklearn.pipeline import FeatureUnion


def _scaled_numeric_branch(key):
    """Return a Pipeline of NumberSelector(key=key) followed by StandardScaler()."""
    return Pipeline([
        ('selector', NumberSelector(key=key)),
        ('standard', StandardScaler()),
    ])


# One selector+scaler branch per numeric column.
words = _scaled_numeric_branch('words')
words_not_stopword = _scaled_numeric_branch('words_not_stopword')
avg_word_length = _scaled_numeric_branch('avg_word_length')
commas = _scaled_numeric_branch('commas')

# NOTE(review): `length` is not defined in this cell; it must already exist in
# the kernel (presumably a selector+scaler branch like the ones above) --
# confirm it is defined earlier in the notebook.
feature_branches = [
    ('text', text),
    ('length', length),
    ('words', words),
    ('words_not_stopword', words_not_stopword),
    ('avg_word_length', avg_word_length),
    ('commas', commas),
]
feats = FeatureUnion(transformer_list=feature_branches)

# Wrap the union in a single-step pipeline and fit it on the training frame;
# the transformed matrix is the cell's output.
feature_processing = Pipeline(steps=[('feats', feats)])
feature_processing.fit_transform(X_train_pro)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Feature union followed by a seeded random forest.
forest_steps = [
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(steps=forest_steps)

# Train on the processed training split, then predict the processed test split.
pipeline.fit(X_train_pro, y_train_pro)
preds_pro = pipeline.predict(X_test_pro)

# Share of test rows whose prediction equals the true label.
test_accuracy = np.mean(preds_pro == y_test_pro)
print(test_accuracy)




In [None]:
# Hyperparameters
from sklearn.model_selection import GridSearchCV

# Grid keys follow the `<step>__<sub-step>__<param>` naming of `pipeline`:
# two TF-IDF settings and two random-forest settings, two values each.
hyperparameters_pro = {
    'features__text__tfidf__max_df': [0.9, 0.95],
    'features__text__tfidf__ngram_range': [(1,1), (1,2)],
    'classifier__max_depth': [50, 70],
    'classifier__min_samples_leaf': [1,2],
}

# Exhaustive search over the grid with 3-fold cross-validation.
clf_pro = GridSearchCV(estimator=pipeline, param_grid=hyperparameters_pro, cv=3)

# Fit and tune model
clf_pro.fit(X_train_pro, y_train_pro)

In [None]:
# Show the parameter combination the fitted grid search selected as best.
clf_pro.best_params_



In [None]:
# No explicit refit is needed: `clf_pro` was built without a `refit` argument,
# and GridSearchCV's default (refit=True) retrains the best configuration on
# the whole training set during `fit`. The former bare `clf_pro.refit` line
# only read that attribute and did nothing, so it was removed.
preds_clf_pro = clf_pro.predict(X_test_pro)
probs_clf_pro = clf_pro.predict_proba(X_test_pro)

# Persist the `data` frame; a later cell reads this file back in.
data.to_csv('prediction_train_processed.csv')

# Share of test rows predicted correctly. Kept as the cell's last expression
# so the notebook actually displays it (it used to sit mid-cell, unseen).
np.mean(preds_clf_pro == y_test_pro)

In [None]:
# Reload the saved frame and run it through the same `processing` step.
submission_pro = processing(pd.read_csv('prediction_train_processed.csv'))

# Class-probability predictions from the tuned pipeline, one row per input row.
predictions_pro = clf_pro.predict_proba(submission_pro)

# Give the probability frame the same index as `submission_pro`: the concat
# below aligns on index labels, so a fresh 0..n-1 index would mispair rows
# whenever `processing` returns a frame with a non-default index.
preds_pro_df = pd.DataFrame(
    data=predictions_pro,
    columns=clf_pro.best_estimator_.named_steps['classifier'].classes_,
    index=submission_pro.index,
)

#generating a submission file
result = pd.concat([submission_pro[['id']], preds_pro_df], axis=1).set_index('id')
result.to_csv('final_prediction_train.csv')

# Preview goes last so the notebook renders it (it used to be mid-cell and
# was therefore never shown).
result.head()

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validate the current `pipeline` on the processed training split using
# cross_validate's default settings; the resulting dict is the cell's output.
scores = cross_validate(estimator=pipeline, X=X_train_pro, y=y_train_pro)
scores

In [None]:
# Rebinds `pipeline` to a two-step model: scaling without mean-centering,
# followed by a logistic-regression classifier stored under the step name 'clf'.
scaler_and_classifier = [
    ('normalizer', StandardScaler(with_mean=False)),  # step 1 - normalize data
    ('clf', LogisticRegression()),                    # step 2 - classifier
]
pipeline = Pipeline(steps=scaler_and_classifier)
pipeline.steps

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training texts only.
X_train_pro_tf = tfidf.fit_transform(X_train_processed)

# Reuse the fitted vocabulary/IDF for the test texts. Calling fit_transform
# here (as before) refits the vectorizer on the test set, which yields a
# different column layout than the training matrix and leaks test statistics.
X_test_pro_tf = tfidf.transform(X_test_processed)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers to compare. The list used to contain SVC() twice with
# identical settings, which only repeated the same evaluation; one entry is
# kept. The tree-based models are seeded (42, matching the random forest used
# earlier in the notebook) so re-running the cell reproduces the scores.
clfs = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
]

# NOTE(review): `pipeline` here is the scaler + 'clf' pipeline, but X_train_pro
# still contains the text column 'processed', which StandardScaler cannot
# handle. The TF-IDF matrix (X_train_pro_tf) looks like the intended input --
# confirm, and check that its rows line up with y_train_pro.
with open('descriptive_statistics_classifiers.csv', 'w') as f:
    for classifier in clfs:
        # Swap the candidate into the pipeline's 'clf' step and cross-validate it.
        pipeline.set_params(clf=classifier)
        scores = cross_validate(pipeline, X_train_pro, y_train_pro)

        # One report section per classifier: header, then mean/std per metric.
        print('---------------------------------', file=f)
        print(str(classifier), file=f)
        print('-----------------------------------', file=f)
        for key, values in scores.items():
            print(key,' mean ', values.mean(), file=f)
            print(key,' std ', values.std(), file=f)
# The `with` block closes the file on exit; no explicit f.close() is required.