In [1]:
import pandas as pd

import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

In [3]:
meta_clusters = pd.read_csv('../data/meta_clusters_with_formatted_name.csv')
demo_clusters = pd.read_csv("../../cc_election_cleaning/district_level_demo_clusters.csv")

In [4]:
faction_compare = pd.read_csv('../../citycouncil_tweets/data/faction_compare.csv')

In [46]:
dc = demo_clusters[['matched_name','demo_cluster','district']]

In [47]:
dc.columns = ['demo_name','demo_cluster','district']

In [48]:
fc = faction_compare[['VotePersonName', 'faction']]

In [49]:
mc = meta_clusters[['proper_name', 'k_spon','louv','vote_cluster']]

In [50]:
k_mode = meta_clusters[['proper_name', 'kmode_cluster']]

In [10]:
def fuzzy_match_names(name, choices):
    return process.extractOne(name, choices, scorer=fuzz.token_sort_ratio)

In [51]:
dc['demo_name'] = dc['demo_name'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dc['demo_name'] = dc['demo_name'].astype(str)


In [52]:
mc['matched_name'] = mc['proper_name'].apply(lambda x: fuzzy_match_names(x, fc['VotePersonName'])[0])
dc['matched_name'] = dc['demo_name'].apply(lambda x: fuzzy_match_names(x, fc['VotePersonName'])[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mc['matched_name'] = mc['proper_name'].apply(lambda x: fuzzy_match_names(x, fc['VotePersonName'])[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dc['matched_name'] = dc['demo_name'].apply(lambda x: fuzzy_match_names(x, fc['VotePersonName'])[0])


In [53]:
k_mode['matched_name'] = k_mode['proper_name'].apply(lambda x: fuzzy_match_names(x, fc['VotePersonName'])[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_mode['matched_name'] = k_mode['proper_name'].apply(lambda x: fuzzy_match_names(x, fc['VotePersonName'])[0])


In [54]:
joined = mc.merge(fc, left_on='matched_name', right_on='VotePersonName')
joined = joined.merge(dc, on='matched_name')

In [55]:
joined = joined.drop(columns=['proper_name', 'VotePersonName', 'demo_name'])

In [50]:
joined['faction'].value_counts()

faction
Affluent Liberals           4
Hybrid Progressives         3
DSA                         2
Alphabet Left               2
Old and New Republicans     2
County Aligned Moderates    1
Name: count, dtype: int64

In [56]:
df = joined.copy()

In [57]:
features = ['k_spon', 'louv', 'vote_cluster', 'demo_cluster']
df_train = df[df['faction'].notnull()].copy()
df_test = df[df['faction'].isnull()].copy()

In [58]:
le = LabelEncoder()
df_train['faction_encoded'] = le.fit_transform(df_train['faction'])
y_train = df_train['faction_encoded']
X_train = df_train[features]
X_test = df_test[features]

In [59]:
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)
pred_nb = model_nb.predict(X_test)
# Convert numeric predictions back to original faction names
pred_nb_labels = le.inverse_transform(pred_nb)
# Add predictions to original DataFrame for rows with missing faction
df.loc[df['faction'].isnull(), 'pred_nb'] = pred_nb_labels

In [60]:
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
pred_rf = model_rf.predict(X_test)
pred_rf_labels = le.inverse_transform(pred_rf)
df.loc[df['faction'].isnull(), 'pred_rf'] = pred_rf_labels

In [61]:
model_knn = KNeighborsClassifier(n_neighbors=3)
model_knn.fit(X_train, y_train)
pred_knn = model_knn.predict(X_test)
pred_knn_labels = le.inverse_transform(pred_knn)
df.loc[df['faction'].isnull(), 'pred_knn'] = pred_knn_labels

In [62]:
# if faction col is not null assign it to each prediction column
df['pred_nb'] = df['pred_nb'].fillna(df['faction'])
df['pred_rf'] = df['pred_rf'].fillna(df['faction'])
df['pred_knn'] = df['pred_knn'].fillna(df['faction'])

In [63]:
df = df.merge(k_mode, on='matched_name')

In [64]:
df.head(1)

Unnamed: 0,k_spon,louv,vote_cluster,matched_name,faction,demo_cluster,district,pred_nb,pred_rf,pred_knn,proper_name,kmode_cluster
0,0,0,0,Rita C. Joseph,,3,40,Affluent Liberals,Hybrid Progressives,Affluent Liberals,Rita C. Joseph,0


In [65]:
df.drop(columns=['proper_name'], inplace=True)

In [79]:
df['pred_knn'].value_counts()

pred_knn
Affluent Liberals           30
Hybrid Progressives          4
DSA                          4
Old and New Republicans      4
County Aligned Moderates     2
Alphabet Left                2
Name: count, dtype: int64

In [66]:
df[df['kmode_cluster'] == 3]

Unnamed: 0,k_spon,louv,vote_cluster,matched_name,faction,demo_cluster,district,pred_nb,pred_rf,pred_knn,kmode_cluster
2,0,0,2,Kristin Richardson Jordan,,2,9,Alphabet Left,Hybrid Progressives,Hybrid Progressives,3
19,0,0,2,Chi A. Ossé,,3,36,Alphabet Left,Hybrid Progressives,DSA,3
22,1,0,2,Tiffany Cabán,DSA,4,22,DSA,DSA,DSA,3
25,1,0,2,Sandy Nurse,Hybrid Progressives,2,37,Hybrid Progressives,Hybrid Progressives,Hybrid Progressives,3
29,1,0,2,Alexa Avilés,DSA,4,38,DSA,DSA,DSA,3
30,1,0,2,Jennifer Gutiérrez,Alphabet Left,4,34,Alphabet Left,Alphabet Left,Alphabet Left,3
42,3,0,2,Shahana K. Hanif,Alphabet Left,1,39,Alphabet Left,Alphabet Left,Alphabet Left,3
44,3,0,2,Lincoln Restler,,1,33,Alphabet Left,Alphabet Left,Affluent Liberals,3


In [67]:
df.to_csv('../../shiny_meta_council_clustering/faction_predictions.csv', index=False)