In [476]:
import logging
import numpy as np
import pandas as pd
import nltk
import random
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
random.seed(265)

In [477]:
df_train = pd.read_csv("/Users/revan/Downloads/congressional_tweet_training_data.csv")
df_test = pd.read_csv("/Users/revan/Downloads/congressional_tweet_test_data.csv")

In [578]:
df_train['full_text'] = df_train['full_text'].str[1:]
df_test['full_text'] = df_test['full_text'].str[1:]

In [573]:
STOP_WORDS = set(stopwords.words('english'))
english_punctuations = string.punctuation
punctuations_list = english_punctuations
translator = str.maketrans('', '', punctuations_list)
tokenizer = RegexpTokenizer(r'\w+')
st = nltk.PorterStemmer()
lmtzr = WordNetLemmatizer()

In [636]:
def clean_text(text_str):
    text_str = text_str.lower()
    text_str = re.sub(r"http\S+", "", text_str) #removes links
    text_str = re.sub(r'\d+',"", text_str)
    text_str = re.sub(r'[^\w\s]', " ", text_str) #removes punctuation
    text_str = nltk.word_tokenize(text_str)
    text_str = [word for word in text_str if word not in STOP_WORDS]
    text_str = [word for word in text_str if len(word) > 2 and not re.match(r'\d+',  word)]
    return ' '.join(text_str)
    

In [637]:
df_train["clean_text"] = df_train["full_text"].map(clean_text)
df_test["clean_text"] = df_test["full_text"].map(clean_text)

In [638]:
def remove_Stopwords(text):
    stopW=stopwords.words('english') #get the english stopwords
    return " ".join([i for i in text.split() if i not in stopW])

In [639]:
df_train['clean_text']=df_train['clean_text'].apply(lambda x:remove_Stopwords(x))
df_test['clean_text']=df_test['clean_text'].apply(lambda x:remove_Stopwords(x))

In [640]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize(text):
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
    # 2. Lemmatize text with the appropriate POS tag
    return " ".join([lemmatizer.lemmatize(i, get_wordnet_pos(i)) for i in text.split()])

In [641]:
df_train['clean_text'] = df_train['clean_text'].apply(lambda x:lemmatize(x))
df_test['clean_text'] = df_test['clean_text'].apply(lambda x:lemmatize(x))

In [642]:
df_train[["clean_text","full_text"]].head(1)
print(df_train['full_text'].iloc[1])
print(df_train['clean_text'].iloc[1])

Today I'm urging the @CDCgov to immediately launch a 24/7 phone hotline to address questions from Americans regarding the #Coronavirus.\n\nI'm also urging the agency to hold regular calls with state &amp; local health officials to provide up-to-date info &amp; provide any resources needed. https://t.co/xRzNim8RHM"
today urge cdcgov immediately launch phone hotline address question american regard coronavirus also urge agency hold regular call state amp local health official provide date info amp provide resource need


In [643]:
#df_train["clean_hashtags"] = df_train["hashtags"].map(clean_hashtags)
#df_test["clean_hashtags"] = df_test["hashtags"].map(clean_hashtags)

In [644]:
#df_train["text_2"] =  df_train["clean_hashtags"] + " "  + df_train["clean_text"]  
#df_test["text_2"] =   df_test["clean_hashtags"] + " " + df_test["clean_text"] 

In [661]:
def linearscv(df):    
    #print(df["clean_text"])
    print("starting linearscv")
    x = df['clean_text'].tolist()
    tfid = TfidfVectorizer(strip_accents='ascii', ngram_range=(1,2), max_features=9000, lowercase= True)
    tf = tfid.fit_transform(x)
    #print('No. of feature_words: ', tfid.get_feature_names())
    return (tf)

In [662]:
def accuracy(df_train, df_test):
    y_train = df_train['party_id'].map({'D': 0, 'R': 1}).tolist()
    y_test = df_test['party'].map({'D': 0, 'R': 1}).tolist()
    x_train = linearscv(df_train)
    x_test = linearscv(df_test)
    #print(x_train.shape, x_test.shape)
    print("starting logistic regression")
    clf1 = LogisticRegression(C=2, random_state = 265, max_iter=10000, n_jobs = -1).fit(x_train, y_train)
    print("starting linear regression")
    clf2 = LinearSVC().fit(x_train, y_train)
    #print("starting logisticCV regression")
    #clf3 = LogisticRegressionCV(cv = 5, random_state = 265, max_iter=10000, n_jobs = -1).fit(x_train, y_train)
    score1 = clf1.score(x_test, y_test)
    score2 = clf2.score(x_test, y_test)
    #score3 = clf3.score(x_test, y_test)
    y_predict = clf2.predict(x_test) 
    #print(y_predict)
    results = [] 
    results.append(["LR" , score1])
    results.append(["SVC" , score2])
    #results.append(["LRCV" , score3])
    print(pd.DataFrame(results, columns=['label', 'accuracy']))
    return  y_predict    

In [663]:
y_predict = accuracy(df_train, df_test)

starting linearscv
starting linearscv
starting logistic regression
starting linear regression
  label  accuracy
0    LR  0.607977
1   SVC  0.612204


In [657]:
def create_output(df_test, y_pred):
    df_out = df_test[["Id"]].copy()
    df_out["party"] = y_pred
    df_out["party"] = df_out["party"].map({0:'D' , 1:'R'})
    df_out.to_csv("/Users/revan/Downloads/final_submission14.csv", index=False)
    print(df_out.head(10))

In [658]:
create_output(df_test, y_predict)

   Id party
0   0     R
1   1     D
2   2     R
3   3     R
4   4     R
5   5     R
6   6     D
7   7     R
8   8     D
9   9     D


In [439]:
df_train[["text_2"]].tail(100)

Unnamed: 0,clean_hashtags,hashtags
592703,FloridiansHelpingFloridiansFloridaStrong,FloridiansHelpingFloridians FloridaStrong
592704,SeeMyOhio11ClevelandAkronOH11,SeeMyOhio11 Cleveland Akron OH11
592705,HeroesActFamiliesFirst,HeroesAct FamiliesFirst
592706,2020Election,2020Election
592707,SmallBiz,SmallBiz
...,...,...
592798,publicservicepublicsafety,publicservice publicsafety
592799,StormyDanielsMichaelWolfeJamesComey,StormyDaniels MichaelWolfe JamesComey
592800,CultureOfCorruption,CultureOfCorruption
592801,appcopoliticsCAC16HouseOfCodeco06,app copolitics CAC16 HouseOfCode co06


In [None]:
def clean_text(text_str):
    text_str = text_str.lower()
    text_str = re.sub(r"http\S+", "", text_str) #removes links
    text_str = re.sub(r'\d+',"", text_str)
    text_str = re.sub(r'[^\w\s]', " ", text_str) #removes punctuation
    tokens = nltk.word_tokenize(text_str)
    tagged = nltk.pos_tag(tokens)
    lemmatized_sentence = []
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), tagged))
    for word, tag in tagged:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:       
            # else use the tag to lemmatize the token
            #print(word)
            lemmatized_sentence.append(lmtzr.lemmatize(word, pos_tagger(tag)))
    lemmatized_sentence = " ".join(lemmatized_sentence)
    word_tokens = nltk.word_tokenize(lemmatized_sentence)
    filtered_sentence = []
    for w in word_tokens:
        if len(w) <= 2 or re.match(r'\d+',  w): continue
        if w not in stop_words:
            filtered_sentence.append(w)
    return ' '.join(filtered_sentence)

def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return wordnet.NOUN