# Run only once

In [231]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression

# Necessary Functions

In [384]:
def leave_only_letters(series):
    #changing emojis
    #TODO
    
    #Removing puntuation
    series = series.str.lower().str.replace(r'[^a-zA-Z ]+','') \
        .str.replace(r' {2,}',' ')
        
    #Removing digits
    series = series.str.replace(r'\d','')
    
    #Removing accents
    series = series.str.replace(r'á|à','a') \
        .str.replace(r'é|è','e') \
        .str.replace(r'í|ì','i') \
        .str.replace(r'ó|ò','o') \
        .str.replace(r'ú|ù','u')
        
    return series

class LemmaTokenizer(object):
    def __init__(self):
        #self.wnl = WordNetLemmatizer()
        self.stemmer = nltk.stem.snowball.SnowballStemmer('spanish')
    
    def __call__(self, doc):
        #return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]

# Loading files

In [385]:
df = pd.read_csv('tweets.csv', encoding='latin-1', sep=';', usecols=['Tweet ID', 'Tweet Text', 'Positive for CL? 1=Yes, 0=No'], index_col='Tweet ID')
df = df.rename(columns={'Positive for CL? 1=Yes, 0=No': 'category', 'Tweet Text': 'tweet'})

In [386]:
df['tweet'] = leave_only_letters(df['tweet'])

In [387]:
df[df['tweet'].str.contains('000')]

Unnamed: 0_level_0,tweet,category
Tweet ID,Unnamed: 1_level_1,Unnamed: 2_level_1


In [388]:
df.describe()

Unnamed: 0,category
count,6250.0
mean,0.30176
std,0.459058
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


## Spliting set

In [389]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['category'])

## Feature vector

### Method 1 (no feature extraction method)

In [390]:
vect = CountVectorizer(
    analyzer='word', 
    ngram_range=(1,3),
    stop_words=nltk.corpus.stopwords.words('spanish'),
    lowercase=True,
    min_df=0.001,
    tokenizer=LemmaTokenizer()
).fit(X_train)
len(vect.get_feature_names())

2142

In [391]:
vect.get_feature_names()[:10]

['aaaf',
 'aaaf claudialopez',
 'aaaf claudialopez ficogutierrez',
 'aaaf mafezinh',
 'aaaf mafezinh claudialopez',
 'aabenedetti',
 'abaddonex',
 'abort',
 'abraz',
 'abrileac']

### Method 2 (tf-idf)

In [407]:
vect = TfidfVectorizer(
    analyzer='word', 
    ngram_range=(1,3),
    stop_words=nltk.corpus.stopwords.words('spanish'),
    lowercase=True,
    min_df=0.001,
    tokenizer=LemmaTokenizer()
).fit(X_train)

len(vect.get_feature_names())

2142

In [408]:
vect.get_feature_names()[:10]

['aaaf',
 'aaaf claudialopez',
 'aaaf claudialopez ficogutierrez',
 'aaaf mafezinh',
 'aaaf mafezinh claudialopez',
 'aabenedetti',
 'abaddonex',
 'abort',
 'abraz',
 'abrileac']

## Creating the vector traning examples

In [409]:
X_train_vector = vect.transform(X_train)
X_test_vector = vect.transform(X_test)

In [410]:
X_train_vector

<4687x2142 sparse matrix of type '<class 'numpy.float64'>'
	with 44267 stored elements in Compressed Sparse Row format>

# Random Forest

In [503]:
rf_csf = RandomForestClassifier(min_samples_leaf=10).fit(X_train_vector, y_train)

In [504]:
print("Score in training set: %f" % rf_csf.score(X_train_vector, y_train))
print("Score in test set: %f" % rf_csf.score(X_test_vector, y_test))

Score in training set: 0.759334
Score in test set: 0.716571


# Logistic Regression

In [509]:
lr_csf = LogisticRegression().fit(X_train_vector, y_train)
print("Score in training set: %f" % lr_csf.score(X_train_vector, y_train))
print("Score in test set: %f" % lr_csf.score(X_test_vector, y_test))

Score in training set: 0.815660
Score in test set: 0.754958
