# CL PROJECT (Cleaning data)

## Execute this only once

In [47]:
!pip install pandas
!pip install nltk
!pip install ipdb
!pip install numpy
!pip install scipy
!pip install sklearn

Collecting scipy
  Downloading scipy-0.19.1-cp35-cp35m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (16.1MB)
[K    100% |████████████████████████████████| 16.1MB 50kB/s eta 0:00:01
Installing collected packages: scipy
Successfully installed scipy-0.19.1


## Importing necessary libraries

In [158]:
import pandas as pd
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from IPython.core.debugger import set_trace #in order to do debug
import os #in order to make sounds

# Creating necessary functions

In [3]:
def bip():
    os.system("printf '\a'")

## Loading data

In [15]:
df = pd.read_csv('./tweets.csv', sep=';',encoding='latin-1')
del df['Profile Image']
df = df.rename(columns={'tweet text':'Tweet text - no links', 'Positive for CL? 1=Yes, 0=No': 'category'});

In [16]:
df.head()

Unnamed: 0,Date,Screen Name,Full Name,Tweet Text,Tweet ID,App,Followers,Follows,Retweets,Favorites,Verfied,User Since,Location,Bio,Google Maps,Tweet text - no links,eq?,random,category
0,26.05.17 14:09,@JLithium,Mateo H. Sanclemente,@jorgepatarroyo1 @ClaudiaLopez Aquel que lucha...,868212518879211000,Twitter for iPhone,115,239,0,0,No,30.10.2010,"Bogot, Colombia",I've got Nothing to lose!,,@jorgepatarroyo1 @claudialopez aquel que lucha...,FALSCH,1.47796,1
1,30.05.17 13:38,@simedi21,cecilia lozano,RT @PerrofantasmaBO: @WRadioColombia @ClaudiaL...,869654185729556000,Twitter for iPhone,402,250,1,0,No,18.04.2014,,,,rt @perrofantasmabo: @wradiocolombia @claudial...,FALSCH,5.38497,0
2,27.05.17 20:47,@gualo1001,oswaldo trejos rios,@Camilabaron26 @galloalex23 @ChavezTrump @Vick...,868675105613860000,Twitter Web Client,877,1691,0,0,No,02.12.2009,colombia,ahora sigue la batalla mas dura el ascenso de ...,,@camilabaron26 @galloalex23 @chaveztrump @vick...,FALSCH,184607.0,0
3,30.05.17 13:37,@simedi21,cecilia lozano,@WRadioColombia @ClaudiaLopez LAS CONFISCADAS ...,869654122408136000,Twitter for iPhone,402,250,0,0,No,18.04.2014,,,,@wradiocolombia @claudialopez las confiscadas ...,FALSCH,467444.0,0
4,25.05.17 16:59,@gusgomez1701,GustavoGmezCrdoba,@andresdiaz8527 @LaLuciernaga @DianaCalderonF ...,867892970594467000,Twitter Web Client,404299,8689,0,0,No,16.02.2011,,Repblica. Imperio de la ley. Democracia. Libr...,,@andresdiaz8527 @laluciernaga @dianacalderonf ...,FALSCH,719656.0,1


##### show any NaN in category

In [17]:
df[df['category'].isnull()]

Unnamed: 0,Date,Screen Name,Full Name,Tweet Text,Tweet ID,App,Followers,Follows,Retweets,Favorites,Verfied,User Since,Location,Bio,Google Maps,Tweet text - no links,eq?,random,category


##### showing unique values of category

In [18]:
df['category'].unique()

array([1, 0])

In [19]:
df.describe()

Unnamed: 0,Tweet ID,Followers,Follows,Retweets,Favorites,category
count,6250.0,6250.0,6250.0,6250.0,6250.0,6250.0
mean,8.693529e+17,5655.797,760.284,7.29744,0.49712,0.30176
std,1390618000000000.0,90762.04,1887.971456,98.302997,4.577395,0.459058
min,8.675255e+17,0.0,0.0,0.0,0.0,0.0
25%,8.679492e+17,43.0,111.0,0.0,0.0,0.0
50%,8.69272e+17,141.0,282.0,0.0,0.0,0.0
75%,8.705842e+17,467.75,699.75,0.0,0.0,1.0
max,8.718893e+17,3207143.0,45714.0,3729.0,164.0,1.0


# Feature Extraction

## Cleanup

### Removing emojis punctuation, digits and accents

removing:
* @(),.;:-_!¡?¿&"'=#$\%/*+\{\}\[\] and unecessary spaces
* numbers
* áéíóú and also àèìòù
* Change emojis to avoid replacement

regex: `[^a-zA-Z ]+`

In [20]:
#df['clean-text'] = df['Tweet text - no links'].str.lower() \
#    .str.replace(r'\(|\)|,|\.|;|\:|-|_|!|¡|\?|\¿|@|&|\"|\'|\=|#|$|%|/|\*|\+|\{|\}|\[|\]', '') \

#Creating function
def leave_only_letters(series):
    #changing emojis
    #TODO
    
    #Removing puntuation
    series = series.str.lower().str.replace(r'[^a-zA-Z ]+','') \
        .str.replace(r' {2,}',' ')
        
    #Removing digits
    series = series.str.replace(r'\d','')
    
    #Removing accents
    series = series.str.replace(r'á|à','a') \
        .str.replace(r'é|è','e') \
        .str.replace(r'í|ì','i') \
        .str.replace(r'ó|ò','o') \
        .str.replace(r'ú|ù','u')
        
    return series
        
df['clean-text'] = leave_only_letters(df['Tweet text - no links'])


### Removing stop words

such as 'en', 'la', 'el', etc

In [21]:
def remove_stop_words(series):
    stop = nltk.corpus.stopwords.words('spanish')
    return series.apply(lambda text : ' '.join([word for word in text.split() if word not in stop]))

df['clean-text'] = remove_stop_words(df['clean-text'])

### Stemming words

Using `SnowballStemmer` from `nltk`

In [22]:
def stem(series):
    stemmer = nltk.stem.snowball.SnowballStemmer('spanish')
    return series.apply(lambda text: ' '.join([stemmer.stem(word) for word in text.split()]))

df['clean-text'] = stem(df['clean-text'])

# Feature Engineering (Categorical Proportional Differences)

The idea here is to create the features which will be used in the SVM algorithm, but there are some problems,

## Problems
### Problems not solved
1. for example `[clau, claudia, claud, claudialopez]` or between `[colom, colomb, colombi]` are recognized as different words, it would be nice to merge all those words, there is a process similar to stem process which conserves the readibility of the words, it could work
2. Some words have a CPD of 1 or -1, that's because are weird words that only appear twice or three times around all examples, therefore will have a very high CPD
3. There are some examples where a feature does not appear nor in A or in B, for example `gallinaz` or `otroshabl`

### Problems solved
* removing one or two letter words


## Creating list of features (i.e. words)

In [23]:
all_features = set()

In [24]:
df['clean-text'].str.lower().str.split().apply(all_features.update);

### Removing one or two letter words

In [25]:
all_features = set([word for word in all_features if len(word) > 3]);

### Creating data frame from which to create the CPD

the values A,B,C,D (undercased) will be based on the following table

 | $c$ | $\neg c$
---|---|---
$w$ | A | B
$\neg w$ | C | D

Where $c$ is if `category` is true (i.e. is positive) and $w$ is whether the `specific feature` is in the tweet or not

In [26]:
cpd = pd.DataFrame(index=all_features, columns=['a','b','c','d','total','cpd','wcpd','abs-wcpd'])
cpd = cpd.fillna(0)

### Creating functions needed in the algorithm

In [27]:
def count(feature, cpd, df):
    #cpd[feature.name]['a']
    contain = df['category'][df['clean-text'].str.contains(feature.name) == True]
    not_contain = df['category'][df['clean-text'].str.contains(feature.name) == False]
    
    #set_trace()
    contain_count = contain.count()
    not_contain_count = not_contain.count()
    contain_positive = contain[contain == 1].count()
    not_contain_positive = not_contain[not_contain == 1].count()
    
    cpd.loc[feature.name,'a'] = contain_positive
    cpd.loc[feature.name,'b'] = contain_count - contain_positive
    cpd.loc[feature.name,'c'] = not_contain_positive
    cpd.loc[feature.name,'d'] = not_contain_count - not_contain_positive
    
    
    cpd.loc[feature.name,'total'] = cpd.loc[feature.name,'d'] + cpd.loc[feature.name,'c'] + cpd.loc[feature.name,'b'] + cpd.loc[feature.name,'a']
    
    if cpd.loc[feature.name,'a'] + cpd.loc[feature.name,'b'] == 0:
        cpd.loc[feature.name,'cpd'] = np.nan
    else:
        cpd.loc[feature.name,'cpd'] = (cpd.loc[feature.name,'a'] - cpd.loc[feature.name,'b']) / (cpd.loc[feature.name,'a']+cpd.loc[feature.name,'b'])


In [28]:
#%time cpd[:].apply(count, axis=1, raw=False, cpd=cpd, df=df);
cpd[:].apply(count, axis=1, raw=False, cpd=cpd, df=df);

bip()

### Select the most important features
It depends on the policy used, tentatively I will use Weighted Categorial Proportional Difference (WCPD), what WCPD does is that weight the CPD index by the number of times that the word appears, that is $(A+B)/(A+B+C+D)$ now that words that have a very high CPD but participate almost never in the examples won't be taken into account.

#### Running WCPD

In [29]:
cpd['wcpd'] = cpd['cpd']*(cpd['a']+cpd['b'])/cpd['total'];
cpd['abs-wcpd'] = cpd['wcpd'].abs();

In [30]:
cpd.sort_values('abs-wcpd', ascending=False, inplace=True)

### Create a dataframe 
with columns 'tweet id', [features*] and with category and then divide them into the $X$ and $y$ sets. Lets take the same the first 1000 features with the highest wcpd

#### What is shoud do is:
* Create a df with the id of the tweet and as columns the 1000 highest wcpd features (i.e. words)
* Get the ids of the tweets (the examples)
* For each feature say whether the tweet has or no that feature and add it to the new df

In [48]:
df_rf = pd.DataFrame(index=df['Tweet ID'], columns=cpd[:1000].index)

##### Function in order to populate df_rf

In [49]:
def populator(feature, df, df_rf):
    contains = pd.Series(df['clean-text'].apply(lambda x : feature.name in x).values,index=df['Tweet ID'])
    df_rf.loc[:,feature.name] = contains

##### Populating

In [51]:
df_rf.sort_index(inplace=True, ascending=True);
df.sort_values('Tweet ID', inplace=True, ascending=True);

df_rf.apply(populator, axis=0, raw=False, df=df, df_rf=df_rf);
bip()

In [52]:
df_rf.head()

Unnamed: 0_level_0,claudi,clau,claud,audi,claudialopez,lopez,petr,gust,gustav,juan,...,tont,williamfarfanm,beln,pierd,cafesalud,mald,danielrm,histori,adel,simontrinifarb
Tweet ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
867525452486897000,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
867525472112050000,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
867525505570000000,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
867525520036163000,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
867525558728568000,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Random Forest
Run a random forest in order to get up to 10.000 training examples, only accept new examples with 90%+ certanty

## Running RF

In [53]:
Xrf_train, Xrf_test, yrf_train, yrf_test = train_test_split(df_rf, df['category'])

rf_clf = RandomForestClassifier().fit(Xrf_train, yrf_train)

#Scores
print("Accurancy of RF classifier on training set: {:.2f}"
     .format(rf_clf.score(Xrf_train, yrf_train)))
print("Accurancy of RF classifier on test set: {:.2f}"
     .format(rf_clf.score(Xrf_test, yrf_test)))

bip()

Accurancy of RF classifier on training set: 0.97
Accurancy of RF classifier on test set: 0.74


# Construct the new df
containing the 10.000 examples

## Loading the new csv

In [61]:
new_df = pd.read_csv('./first_round_depured.csv', sep=';', encoding='latin-1')
new_df.describe()

Unnamed: 0,Tweet ID,Followers,Follows,Retweets,Favorites,#@
count,53213.0,53213.0,53213.0,53213.0,53213.0,53213.0
mean,8.58913e+17,6715.676,1435.576551,370.073,0.012591,0.883543
std,19466160000000.0,125360.8,8021.029229,6783.045,0.162525,1.046478
min,8.58853e+17,0.0,0.0,0.0,0.0,0.0
25%,8.588917e+17,159.0,181.0,0.0,0.0,0.0
50%,8.589069e+17,433.0,397.0,1.0,0.0,1.0
75%,8.589362e+17,1106.0,873.0,37.0,0.0,1.0
max,8.589523e+17,15371040.0,363364.0,1406432.0,22.0,13.0


## Cleaning text

In [62]:
new_df['clean-text'] = leave_only_letters(new_df['tweet-no-urls']);
new_df['clean-text'] = remove_stop_words(new_df['clean-text']) 
new_df['clean-text'] = stem(new_df['clean-text'])
bip()

## Checking if allright

In [63]:
new_df[new_df['clean-text'].isnull()]

Unnamed: 0,Date,Screen Name,Full Name,Tweet Text,Tweet ID,App,Followers,Follows,Retweets,Favorites,Verfied,User Since,Location,Bio,Google Maps,tweet-no-urls,rt?,#@,original,clean-text


### Creating new X df

In [64]:
Xnew = pd.DataFrame(index=new_df['Tweet ID'], columns=cpd[:1000].index)

In [95]:
#Remove the :100
new_df.sort_values('Tweet ID', inplace=True, ascending=True);
Xnew.sort_index(inplace=True, ascending=True);

Xnew[:100].apply(populator, axis=0, raw=False, df=new_df[:100], df_rf=Xnew);
bip()

In [96]:
Xnew.head()

Unnamed: 0_level_0,claudi,clau,claud,audi,claudialopez,lopez,petr,gust,gustav,juan,...,tont,williamfarfanm,beln,pierd,cafesalud,mald,danielrm,histori,adel,simontrinifarb
Tweet ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
858853042841669632,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
858853209003225088,True,True,True,True,True,True,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
858853478529105921,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
858853695886413826,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
858853826543091714,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Checking if there is any NAN

In [97]:
Xnew[:100][Xnew[:100].isnull().any(axis=1)]

Unnamed: 0_level_0,claudi,clau,claud,audi,claudialopez,lopez,petr,gust,gustav,juan,...,tont,williamfarfanm,beln,pierd,cafesalud,mald,danielrm,histori,adel,simontrinifarb
Tweet ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### Select the first 3750 entries (they are already sorted randomly)

In [164]:
prediction_rf = pd.DataFrame(index=Xnew[:100].index, data=rf_clf.predict_proba(Xnew[:100]));
new_training_df = prediction_rf[(prediction_rf >= 0.9).any(axis=1)].sample(frac=1)[:3750]

### Creating `Ynew`

In [166]:
Ynew = pd.Series(index=new_training_df.index, data=new_training_df[1]);
Ynew = Ynew.apply(lambda x : 1 if x > 0.9 else 0)
Ynew.head()

Tweet ID
858860347213705216    0
858854298200965121    0
858855127914086400    0
858864713991692289    0
858862159668891648    0
Name: 1, dtype: int64

# Creating df_svm and y_svm (with around 10.000 examples)

In [187]:
df_svm = df_rf.append(Xnew[Xnew.index.isin(Ynew.index)]);
y_svm = pd.Series(data=df['category'])
y_svm.index = df['Tweet ID']
y_svm = y_svm.append(Ynew)

# SVM
Run a svm algorithm on the 10.000 examples dataframe

### Checking if there is any NaN

In [189]:
y_svm[y_svm.isnull()]

Series([], dtype: int64)

In [192]:
Xsvm_train, Xsvm_test, ysvm_train, ysvm_test = train_test_split(df_svm, y_svm)

clf_svm = SVC().fit(Xsvm_train, ysvm_train)
bip()

print('Accuracy of SVC classifier on training set: {:.2f}'
     .format(clf_svm.score(Xsvm_train, ysvm_train)))
print('Accuracy of SVC classifier on test set: {:.2f}'
     .format(clf_svm.score(Xsvm_test, ysvm_test)))

Accuracy of SVC classifier on training set: 0.71
Accuracy of SVC classifier on test set: 0.68
