In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import  TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

train = pd.read_csv("train.csv",encoding="latin-1")
test = pd.read_csv("test.csv",encoding="latin-1")

In [15]:
train.shape

(31962, 3)

In [16]:
from nltk.corpus import stopwords
import string
import re
punctuation = string.punctuation
stopwords = stopwords.words("english")
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()

def _clean(text):
    text = text.lower()
    text = re.sub(r"amp", "", text)
    #text = re.sub(r"#\w+","",text)
    text = re.sub(r"@\w+","",text)
    text = "".join(x for x in text if x not in punctuation)
    
    words = text.split()
    #words = [" " if w.startswith("@") else w for w in words ]
    #words = [w for w in words if w != "user" ]
    #words = [w for w in words if w != "amp" ]
    
    #words = [w for w in words if w not in stopwords]
    words = [lem.lemmatize(word,"v") for word in words]
    #words = [lem.lemmatize(word,"n") for word in words]
    words = [w for w in words if len(w) >2]
    
    
    
    text = " ".join(words)
    return text

def remove_non_ascii(text):
    L = [32, 44, 46, 65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103, 104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]
    text = str(text)

    return ''.join(i for i in text if ord(i) in L)

In [17]:
train["tweet"] = train["tweet"].apply(_clean)
train["tweet"] = train["tweet"].apply(remove_non_ascii)

test["tweet"] = test["tweet"].apply(_clean)
test["tweet"] = test["tweet"].apply(remove_non_ascii)

In [18]:
#train['tweet'] = train['tweet'].str.replace('\d+', '')
#test['tweet'] = test['tweet'].str.replace('\d+', '')
train_text = train['tweet']
test_text = test['tweet']
all_text = pd.concat([train_text, test_text])

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{2,}',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

In [19]:
tfidf = dict(zip(word_vectorizer.get_feature_names(),word_vectorizer.idf_))
tfidf_idf = pd.DataFrame(columns=["word_tfidf"]).from_dict(tfidf,orient="index")
tfidf_idf.columns = ["word_tfidf"]
tfidf_idf

Unnamed: 0,word_tfidf
aap,8.306328
aap spokesperson,9.605611
abandon,9.163778
abc,9.404940
abe,9.856925
...,...
zone,8.624782
zoo,9.094785
zootopia,9.856925
zoro,10.193398


In [20]:
tfidf = dict(zip(char_vectorizer.get_feature_names(),char_vectorizer.idf_))
tfidf_idf = pd.DataFrame(columns=["word_tfidf"]).from_dict(tfidf,orient="index")
tfidf_idf.columns = ["word_tfidf"]
tfidf_idf

Unnamed: 0,word_tfidf
a,1.856863
aa,7.512376
aap,8.219317
ab,4.349853
abl,6.966554
...,...
zz,7.725298
zza,7.303026
zza,7.675701
zzl,8.506999


In [21]:
# scores = []
# submission = pd.DataFrame.from_dict({'id': test['id']})
# train_target = train['label']
# classifier = LogisticRegression(C=0.1, solver='sag')

# cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
# scores.append(cv_score)
# print('CV score for class {} is {}'.format("label", cv_score))

# classifier.fit(train_features, train_target)

# submission['label'] = classifier.predict_proba(test_features)[:, 1]


#submission.to_csv('submission.csv', index=False)

In [22]:
train_target = train['label']
# model = LogisticRegression()
# model.fit(train_features,train_target)
# preds = model.predict(test_features)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

train_x, val_x, train_y, val_y = train_test_split(train_features,train_target)

model = LogisticRegression(solver='newton-cg')
model.fit(train_x,train_y)
preds = model.predict(val_x)
print(accuracy_score(preds,val_y))
print(f1_score(val_y, preds, average="macro"))
CM = confusion_matrix(val_y, preds)
TN = CM[0][0]
print(TN)
FN = CM[1][0]
print(FN)
TP = CM[1][1]
print(TP)
FP = CM[0][1]
print(FP)


0.9579526967838818
0.7898499679723792
7401
312
254
24


In [23]:
# model = ensemble.ExtraTreesClassifier()
# model.fit(train_features,train_target)
# preds = model.predict(test_features)

model = ensemble.ExtraTreesClassifier(n_estimators=100)
model.fit(train_x,train_y)
preds = model.predict(val_x)
print(accuracy_score(preds,val_y))
print(f1_score(val_y, preds, average="macro"))
CM = confusion_matrix(val_y, preds)
TN = CM[0][0]
print(TN)
FN = CM[1][0]
print(FN)
TP = CM[1][1]
print(TP)
FP = CM[0][1]
print(FP)

0.9653360030033789
0.8423508571535201
7386
238
328
39


In [24]:
# Creating an empty data frame
submission = pd.DataFrame()
preds = model.predict(test_features)
# Assigning values to the data frame-submission_kaggle
submission['id'] = test.id
submission['label'] = preds
submission.head()

Unnamed: 0,id,label
0,31963,0
1,31964,1
2,31965,0
3,31966,0
4,31967,0


In [25]:
submission['label'].value_counts()

0    16386
1      811
Name: label, dtype: int64

In [26]:
# saving data as  final_kaggle.csv
submission.to_csv('submission.csv',index=False)