In [73]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

def get_train_test_data():
    import sqlite3
    conn = sqlite3.connect('tweets.db')
    sql = """
        select tweets.tid, date, text, churn, "set" from tweets join churn on churn.tid = tweets.tid
    """
    cursor = conn.cursor()
    cursor.execute(sql)
    tw_data = pd.DataFrame(cursor.fetchall())
    cursor.close()
    tw_data.columns = ['tid', 'date', 'text', 'churn', 'set']
    tw_data.tid = tw_data.tid.astype(np.int)
    tw_data.date = pd.to_datetime(tw_data.date)
    tw_data.pop('date')
    
    tw_train = tw_data[tw_data.set == 'training']
    tw_test = tw_data[tw_data.set == 'hidden']
    tw_train.pop('set')
    tw_test.pop('set')

    return tw_train, tw_test
tw_train_valid, tw_test = get_train_test_data()

tw_test_x = tw_test['text']
tw_test_y = tw_test['churn']

tw_train_valid_x = tw_train_valid['text']
tw_train_valid_y = tw_train_valid['churn']

tw_train_x, tw_valid_x, tw_train_y, tw_valid_y = train_test_split(tw_train_valid_x, tw_train_valid_y, test_size=0.25)

In [78]:
import re
import collections

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

MOST_RELATE_WORD_COUNT = 100

def get_relate_cv():
    pat = re.compile('[@,;.*/!?#]')
    replac_sym = re.compile(pat)

    freq_cv = TfidfVectorizer()
    sum_text = tw_train_x.map(lambda i: re.sub(pat, ' ', i)).sum().lower()
    ct = collections.Counter(sum_text.split())
    for k in ct.copy():
        if ct[k] <= 2:
            ct.pop(k)
    freq_cv.fit(list(ct.keys()))
    
    cv = TfidfVectorizer()
    train_feature = pd.DataFrame(freq_cv.transform(tw_train_x).toarray(), index=tw_train_x.index)
    train_feature.columns = freq_cv.get_feature_names()
    word_corr = (train_feature.corrwith(tw_train_y) - train_feature.corrwith(-tw_train_y+1)).dropna()
    relate_words = list(word_corr.sort_values(ascending=False).index)
    _bench = int(MOST_RELATE_WORD_COUNT/2)
    choice_words = relate_words[:_bench] + relate_words[-_bench:]
    cv.fit(choice_words)
    
    print('choice related words'.center(80, '#'))
    print(choice_words)
#     print(word_corr.sort_values(ascending=False))
    return cv

cv = get_relate_cv()

##############################choice related words##############################
['im', 'leaving', 'contract', 'tmobile', 'leave', 'wait', 'companies', 'goodbye', 'cancel', 'thinking', 'cant', 'till', 'hello', 'bye', 'my', 'done', 'to', 'months', 'expensive', 'lines', 'fee', 'switching', 'reason', 'canceling', 'december', 'mom', 'next', 'hurry', 'net10', 'weeks', 'insane', 'anyway', 'fuck', 'more', 'switch', 'prices', 'asses', 'bullshit', 'considering', 'soon', 'swear', 'yea', 'hate', 'metro', 'finna', 'verizonwireless', 'they', 'maybe', 'gigs', 'wit', 'frustrated', 'guy', 'havent', 'users', 'call', 'verse', 'home', 'sfgiants', 'update', '450', 'fios', 'lumia', 'into', 'white', 'calls', 'like', 'never', 'it', 'commercials', 'probably', 'customers', 'before', 'store', 'down', 'on', 'called', 'work', 'note', 'and', 'didnt', 'text', 'see', 'tired', 'lte', 'he', 'disappointed', 'end', 'us', 'in', 'co', 'give', 'said', 'every', 'mobile', 'game', 'had', 'your', 'park', 'center', 'the']


In [79]:
clf = LinearSVC()

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

train_fea = pd.DataFrame(cv.transform(tw_train_x).toarray(), columns=cv.get_feature_names())
clf.fit(train_fea, tw_train_y).fit(train_fea, tw_train_y).fit(train_fea, tw_train_y)

valid_fea = pd.DataFrame(cv.transform(tw_valid_x).toarray(), columns=cv.get_feature_names())
print(clf.score(valid_fea, tw_valid_y))
print(classification_report(tw_valid_y, clf.predict(valid_fea)))


0.8202247191011236
             precision    recall  f1-score   support

        0.0       0.86      0.92      0.89       491
        1.0       0.60      0.45      0.52       132

avg / total       0.81      0.82      0.81       623



In [80]:
import pickle

test_fea = pd.DataFrame(cv.transform(tw_test_x).toarray(), columns=cv.get_feature_names())
predict_test = pd.DataFrame(clf.predict(test_fea).astype(np.int), index=tw_test.tid, columns=['Churn'])
predict_test.to_csv('ID_666_Q_2_4_1.csv')

with open('ID_666_Q_2_4_1.pickle', 'wb') as f:
    pickle.dump(clf, f)

In [72]:
len(tw_train_valid)

2492