In [159]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from scipy import stats
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.calibration import CalibratedClassifierCV
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [160]:
X_train = pd.read_csv("train.csv")
X_test = pd.read_csv('test.csv')

In [161]:
X_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,8,,,#RockyFire Update => California Hwy. 20 closed...,1
4,13,,,I'm on top of the hill and I can see a fire in...,1


In [162]:
X_train[X_train['target']==1]['text']

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       #RockyFire Update => California Hwy. 20 closed...
4       I'm on top of the hill and I can see a fire in...
                              ...                        
5341    Suicide bomber kills 15 in Saudi security site...
5342    Two giant cranes holding a bridge collapse int...
5343    @aria_ahrary @TheTawniest The out of control w...
5344    Police investigating after an e-bike collided ...
5345    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 2291, dtype: object

In [163]:
X_train[X_train['target']==0]['text']

8                                          What's up man?
9                                           I love fruits
10                                       Summer is lovely
11                                      My car is so fast
12                                 this is ridiculous....
                              ...                        
5325    @widda16 ... He's gone. You can relax. I thoug...
5326     @jt_ruff23 @cameronhacker and I wrecked you both
5327    Three days off from work and they've pretty mu...
5328    @engineshed Great atmosphere at the British Li...
5329    Cramer: Iger's 3 words that wrecked Disney's s...
Name: text, Length: 3055, dtype: object

In [164]:
print('X_train.shape(sin procesar): ', X_train.shape)
print('X_test.shape(sin procesar): ', X_test.shape)

X_train.shape(sin procesar):  (5346, 5)
X_test.shape(sin procesar):  (2267, 4)


In [165]:
# Eliminamos los duplicados
X_train = X_train.drop_duplicates(subset='text')
print('X_train.shape(sin duplicados): ', X_train.shape)
X_test = X_test.drop_duplicates(subset='text')
print('X_test.shape(sin duplicados): ', X_test.shape)

X_train.shape(sin duplicados):  (5286, 5)
X_test.shape(sin duplicados):  (2247, 4)


In [166]:
import re

signos = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)|(\>)|(\=)|(\<)")
signos_arroba = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)|(\>)|(\=)|(\<)|(\@)")

def signs_tweets(tweet):
    return signos.sub('', tweet.lower())

X_train['text'] = X_train['text'].apply(signs_tweets)
X_train['text'].head()
X_test['text'] = X_test['text'].apply(signs_tweets)
X_test['text'].head()

0     people receive #wildfires evacuation orders i...
1    just got sent this photo from ruby #alaska as ...
2    #flood #disaster heavy rain causes flash flood...
3    there's an emergency evacuation happening now ...
4    i'm afraid that the tornado is coming to our area
Name: text, dtype: object

In [167]:
def remove_links(df):
    return " ".join(['{link}' if ('http') in word else word for word in df.split()])

X_train['text'] = X_train['text'].apply(remove_links)
X_test['text'] = X_test['text'].apply(remove_links)

In [168]:
from nltk.corpus import stopwords

english_stopwords = stopwords.words('english')

def remove_stopwords(df):
    return " ".join([word for word in df.split() if word not in english_stopwords])

X_train['text'] = X_train['text'].apply(remove_stopwords)
X_train.head()
X_test['text'] = X_test['text'].apply(remove_stopwords)
X_test.head()

Unnamed: 0,id,keyword,location,text
0,6,,,people receive #wildfires evacuation orders ca...
1,7,,,got sent photo ruby #alaska smoke #wildfires p...
2,10,,,#flood #disaster heavy rain causes flash flood...
3,14,,,there's emergency evacuation happening buildin...
4,15,,,i'm afraid tornado coming area


In [169]:
from nltk.stem.snowball import SnowballStemmer

def english_stemmer(x):
    stemmer = SnowballStemmer('english')
    return ' '.join([stemmer.stem(word) for word in x.split()])

X_train['text'] = X_train['text'].apply(english_stemmer)
X_test['text'] = X_test['text'].apply(english_stemmer)

In [170]:
X_train = X_train[['text', 'target']]

In [171]:
"""vectorizer = CountVectorizer(binary=True)
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)"""


'vectorizer = CountVectorizer(binary=True)\nvectorizer.fit(X_train)\nX_train = vectorizer.transform(X_train)'

In [172]:
pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', SVC(probability=True))
])

In [175]:
# Aqui definimos el espacio de parámetros a explorar
parameters = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigramas or bigramas
    'cls__C': (0.2, 0.5, 0.7),
    'cls__degree': [2,3,4,5],
    'cls__gamma': [0.001, 0.1, "auto", 1.0, 10.0, 30.0]
}


grid_search = GridSearchCV(pipeline,
                          parameters,
                          cv=5,
                          n_jobs=-1,
                          scoring='roc_auc')

In [None]:
grid_search.fit(X_train['text'], X_train['target'])

In [None]:
print("Best params:", grid_search.best_params_)
print("Best acc:", grid_search.best_score_)
print("Best model:", grid_search.best_estimator_)

In [None]:
grid_search.predict(X_test['text'])

SIN GRID SEARCH

In [142]:
svm = LinearSVC()
clf = CalibratedClassifierCV(svm)
clf.fit(X_train['text'], X_train['target'])
y_proba = clf.predict_proba(X_test['text'])

IndexError: ignored

In [86]:
X_train.shape

AttributeError: ignored