# Run only once

In [92]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Necessary Functions

In [183]:
def leave_only_letters(series):
    #changing emojis
    #TODO
    
    #Removing puntuation
    series = series.str.lower().str.replace(r'[^a-zA-Z ]+','') \
        .str.replace(r' {2,}',' ')
        
    #Removing digits
    series = series.str.replace(r'\dx','')
    
    #Removing accents
    series = series.str.replace(r'á|à','a') \
        .str.replace(r'é|è','e') \
        .str.replace(r'í|ì','i') \
        .str.replace(r'ó|ò','o') \
        .str.replace(r'ú|ù','u')
        
    return series

# Loading files

In [184]:
df = pd.read_csv('tweets.csv', encoding='latin-1', sep=';', usecols=['Tweet ID', 'tweet text', 'Positive for CL? 1=Yes, 0=No'], index_col=0)
df = df.rename(columns={'Positive for CL? 1=Yes, 0=No': 'category', 'tweet text': 'tweet'})

In [185]:
df['tweet'] = leave_only_letters(df['tweet'])

In [186]:
df['tweet']

Tweet ID
868212518879211000    jorgepatarroyo claudialopez aquel que lucha co...
869654185729556000    rt perrofantasmabo wradiocolombia claudialopez...
868675105613860000    camilabaron galloalex chaveztrump vickydavilah...
869654122408136000    wradiocolombia claudialopez las confiscadas af...
867892970594467000    andresdiaz laluciernaga dianacalderonf hora cl...
867580682960351000    rt claudialopez cinismo uribista volver perseg...
869952317088518000    josephralph claudialopez las uribestialidades ...
867859946389602000    juankortiz johnherd caracolradio claudialopez ...
870145530600583000    thrino castroedwinc jeisonlubo emaumor claudia...
869986124059598000    rt alexa claudialopez se las da de pulcra de h...
870331912727937000    crodzmart ricardog mateogqsg claudialopez el e...
870722234586943000    tatiannarive ginachef leer que claudialopez y ...
867942017908846000    claudialopez cuando leo estos comentarios no s...
867770966620995000    claudialopez disculpen pero polti

In [187]:
df.describe()

Unnamed: 0,category
count,6250.0
mean,0.30176
std,0.459058
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


## Spliting set

## Feature vector

### Method 1 (no feature extraction method)

In [188]:
vect = CountVectorizer(
    strip_accents='unicode', 
    analyzer='word', 
    ngram_range=(1,3),
    stop_words=nltk.corpus.stopwords.words('spanish'),
    lowercase=True,
    min_df=0.001
).fit(X_train)
len(vect.get_feature_names())

2117

In [189]:
vect.get_feature_names()[:10]

['000',
 '085aaa7125124f3',
 '085aaa7125124f3 claudialopez',
 '085aaa7125124f3 claudialopez ficogutierrez',
 '085aaa7125124f3 mafezinha86',
 '085aaa7125124f3 mafezinha86 claudialopez',
 '10',
 '100',
 '12',
 '13']

### Method 2 (tf-idf)

In [190]:
vect = TfidfVectorizer(
    strip_accents='unicode', 
    analyzer='word', 
    ngram_range=(1,3),
    stop_words=nltk.corpus.stopwords.words('spanish'),
    lowercase=True,
    min_df=0.001,
).fit(X_train)

len(vect.get_feature_names())

2117

In [191]:
vect.get_feature_names()[:10]

['000',
 '085aaa7125124f3',
 '085aaa7125124f3 claudialopez',
 '085aaa7125124f3 claudialopez ficogutierrez',
 '085aaa7125124f3 mafezinha86',
 '085aaa7125124f3 mafezinha86 claudialopez',
 '10',
 '100',
 '12',
 '13']

## Creating the vector traning examples (Op 1)

In [192]:
X_train_vector = vect.transform(X_train)
X_test_vector = vect.transform(X_test)

In [193]:
X_train_vector

<4687x2117 sparse matrix of type '<class 'numpy.float64'>'
	with 37795 stored elements in Compressed Sparse Row format>

# Random Forest

In [194]:
rf_csf = RandomForestClassifier().fit(X_train_vector, y_train)

In [195]:
print("Score in training set: %f" % rf_csf.score(X_train_vector, y_train))
print("Score in test set: %f" % rf_csf.score(X_test_vector, y_test))

Score in training set: 0.968423
Score in test set: 0.747281
