In [1]:
import pandas as pd
import numpy as np
#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#for word embedding
import gensim
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /Users/yiweihan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yiweihan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yiweihan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df_train= pd.read_csv('congressional_tweet_training_data.csv')
df_test=pd.read_csv('congressional_tweet_test_data.csv')

In [3]:
x=df_train['party_id'].value_counts()
print(x)

D    324202
R    268601
Name: party_id, dtype: int64


In [4]:
df_train.isna().sum()

favorite_count        0
full_text             0
hashtags              0
retweet_count         0
year              18712
party_id              0
dtype: int64

In [5]:
df_train['word_count'] = df_train['full_text'].apply(lambda x: len(str(x).split()))
print(df_train[df_train['party_id']=='R']['word_count'].mean()) 
print(df_train[df_train['party_id']=='D']['word_count'].mean()) 

22.86333260114445
26.960345710390435


In [6]:
df_train['char_count'] = df_train['full_text'].apply(lambda x: len(str(x)))
print(df_train[df_train['party_id']=='R']['char_count'].mean()) 
print(df_train[df_train['party_id']=='D']['char_count'].mean()) 

168.91757662853078
197.6636356345735


In [7]:
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

 
# STOPWORD REMOVAL
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)
#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
 
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [8]:
def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))
df_train.full_text.to_list()
text_cleaned_train = []
for text in df_train['full_text']:
    text_cleaned_train.append(finalpreprocess(text))
df_train['TextClean'] = text_cleaned_train

In [9]:
text_train = []
for i in range(len(df_train['TextClean'])):
    text_train.append(df_train['TextClean'].iloc[i][1:])
df_train['TextClean'] = text_train
df_train.head()

Unnamed: 0,favorite_count,full_text,hashtags,retweet_count,year,party_id,word_count,char_count,TextClean
0,0,"b""RT @KUSINews: One of our longtime viewers wa...",KUSI,10,2017.0,R,24,154,rt kusinews one longtime viewer congressman d...
1,258,"b""Today I'm urging the @CDCgov to immediately ...",Coronavirus,111,2020.0,R,44,317,today urge cdcgov immediately launch phone ho...
2,0,"b'Tomorrow, #MO03 seniors graduate from Calvar...",MO03,2,2014.0,R,21,140,tomorrow mo senior graduate calvary lutheran ...
3,9,b'Congrats to #TeamUSA and Canton Native @JGre...,TeamUSA WorldJuniors,3,2017.0,R,16,130,congrats teamusa canton native jgreenway win ...
4,3,b'Pleased to support @amergateways at their Ju...,ImmigrantHeritageMonth,3,2019.0,D,39,316,pleased support amergateways june fiesta hono...


In [10]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df_train['party_id'] = encoder.fit_transform(df_train["party_id"])

In [11]:
df_train.head()

Unnamed: 0,favorite_count,full_text,hashtags,retweet_count,year,party_id,word_count,char_count,TextClean
0,0,"b""RT @KUSINews: One of our longtime viewers wa...",KUSI,10,2017.0,1,24,154,rt kusinews one longtime viewer congressman d...
1,258,"b""Today I'm urging the @CDCgov to immediately ...",Coronavirus,111,2020.0,1,44,317,today urge cdcgov immediately launch phone ho...
2,0,"b'Tomorrow, #MO03 seniors graduate from Calvar...",MO03,2,2014.0,1,21,140,tomorrow mo senior graduate calvary lutheran ...
3,9,b'Congrats to #TeamUSA and Canton Native @JGre...,TeamUSA WorldJuniors,3,2017.0,1,16,130,congrats teamusa canton native jgreenway win ...
4,3,b'Pleased to support @amergateways at their Ju...,ImmigrantHeritageMonth,3,2019.0,0,39,316,pleased support amergateways june fiesta hono...


In [12]:
df_train['TextHashtag'] = df_train['TextClean']+ ' ' + 2 * df_train['hashtags']

In [13]:
df_test.full_text.to_list()
text_cleaned_test = []
for text in df_test['full_text']:
    text_cleaned_test.append(finalpreprocess(text))
df_test['TextClean'] = text_cleaned_test

In [14]:
text_test = []
for i in range(len(df_test['TextClean'])):
    text_test.append(df_test['TextClean'].iloc[i][1:])
df_test['TextClean'] = text_test
df_test.head()

Unnamed: 0,Id,favorite_count,full_text,hashtags,retweet_count,year,party,TextClean
0,0,70,b'#TaxReform improved the playing field for Am...,TaxReform,13,2018.0,D,taxreform improve playing field american work...
1,1,27,"b'This #NativeWomensEqualPay Day, we recommit ...",NativeWomensEqualPay,11,,D,nativewomensequalpay day recommit pass payche...
2,2,49,"b""\xe2\x80\x9cI became convinced that our gene...",MeToo ShatteringTheSilence,24,2017.0,D,xe x x ci become convinced generation silence...
3,3,14,"b'During #NationalAdoptionMonth, we honor the ...",NationalAdoptionMonth,2,2019.0,D,nationaladoptionmonth honor adoptive parent p...
4,4,13,b'Happy #AirborneDay to our @USArmy paratroope...,AirborneDay AirborneAllTheWay,7,2018.0,D,happy airborneday usarmy paratrooper veteran ...


In [15]:
df_test['party'] = encoder.fit_transform(df_test["party"])

In [16]:
df_test['TextHashtag'] = df_test['TextClean']+ ' ' + 2 * df_test['hashtags']

In [19]:
#Tf-Idf
tv12 = TfidfVectorizer(lowercase=True, ngram_range=(1,2),min_df=2)
X_tv_train = tv12.fit_transform(df_train['TextHashtag'])
y_train = df_train.party_id.tolist()
X_tv_test = tv12.transform(df_test['TextHashtag'])
y_test = df_test.party.tolist()

In [20]:
from sklearn.linear_model import LogisticRegressionCV
lf = LogisticRegressionCV(cv = 2, random_state = 265, max_iter = 200, n_jobs = -1)
lf.fit(X_tv_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegressionCV(cv=2, max_iter=200, n_jobs=-1, random_state=265)

In [21]:
Y_predict = lf.predict(X_tv_test)

In [22]:
final_df = pd.DataFrame(Y_predict)

In [23]:
final_df['Id']=range(0,len(final_df))

In [24]:
final_df = final_df.rename(columns={0:'party'})
final_df.head()

Unnamed: 0,party,Id
0,1,0
1,0,1
2,0,2
3,1,3
4,1,4


In [25]:
Idd = final_df['Id']
final_df = final_df.drop('Id',axis =1)
final_df.insert(0,'Id',Idd)
final_df.head()

Unnamed: 0,Id,party
0,0,1
1,1,0
2,2,0
3,3,1
4,4,1


In [26]:
def fun_c(x):
    if x == 1: 
        return 'R'
    else: 
        return 'D'
final_df['party']=final_df['party'].apply(lambda x: fun_c(x))

In [27]:
final_df.head()

Unnamed: 0,Id,party
0,0,R
1,1,D
2,2,D
3,3,R
4,4,R


In [32]:
final_df.to_csv('/Users/yiweihan/Desktop/sample_submission20.csv')

In [None]:
#We also tried several models below, and the best is logisticregressioncv

#from sklearn.linear_model import RidgeClassifier
#rc=RidgeClassifier()

#from sklearn.naive_bayes import MultinomialNB
#nb=MultinomialNB()

In [None]:
#We see how it works when cleaning the hashtags, but after cleaning and merge, we found out useing
#cleaned_full_text and not cleaned hashtags are better
#ht_train = df_train.hashtags.to_list()
#ht_test = df_test.hashtags.to_list()
#text_cleaned_train_ht = []
#for text in df_train['hashtags']:
    #text_cleaned_train_ht.append(clean_text(text))
#df_train['Text_HT'] = text_cleaned_train_ht
#text_cleaned_test_ht = []
#for text in df_test['hashtags']:
    #text_cleaned_test_ht.append(clean_text(text))
#df_test['Text_HT'] = text_cleaned_test_ht

In [None]:
#We tried several parameter, including cv and tv from ngram(1,1) to (1,3)
#we found out tv(1,2) with min_df=2 is the best

#cv11 = CountVectorizer(lowercase=True,ngram_range=(1,1),min_df=2)

In [None]:
#we merge cleaned text and hashtag togeter, because they are both text
#and hashtags are shorter and more important and clearer, so we gave the weight to 2
#we also tried 3, but the result did not change much