# 06_Linear_SVC
## Ray Hossain

## Setup

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
import itertools
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


### Loading

In [2]:
news = pd.read_csv("Processed_Data/51_textscored_wide.csv")

In [3]:
#news.head()

## Linear SVC but with columns & TF-IDF Aggregation

In [16]:
news.columns 

Index(['personid', 'duration', 'text_clean', 'secondvote', 'vote_left',
       'vote_green', 'vote_spd', 'vote_na_other', 'vote_fdp', 'vote_cdu',
       'vote_afd', 'news_proportion', 'age', 'ml_gender_female', 'ml_married',
       'ml_relationship', 'ml_employ', 'ml_degreestatus',
       'ml_political_interest', 'adjustedvote', 'polinterest',
       'adjustedincome', 'fgender', 'fvoceduc', 'frelationship',
       'fmaritalstatus', 'femployment', 'tf_idf_mean',
       'tf_idf_entity_weighted', 'tf_idf_importance_weighted', 'left_right'],
      dtype='object')

In [17]:
import pandas as pd

# Assuming 'news' is your DataFrame

# Check which columns have any NA values
columns_with_na = news.isna().any()

# Filter to get the columns that have any NA values (where True)
df_with_na_columns = news.loc[:, columns_with_na]

# Output the result
print(df_with_na_columns)


      duration
0     111731.0
1       5429.0
2       1317.0
3       1548.0
4        792.0
...        ...
1339       NaN
1340       NaN
1341       NaN
1342       NaN
1343       NaN

[1344 rows x 1 columns]


In [18]:
# Set all NA values in the DataFrame to 0
news = news.fillna(0)


Attempting with just (semi) continuous variables.

In [19]:

# Define predictors (y variables)
predictors = ["secondvote"]

# Feature sets
features_list = [
    'duration', 'news_proportion', 'age', 'ml_gender_female', 'ml_married', 'ml_relationship',
    'ml_employ', 'ml_degreestatus', 'ml_political_interest', 'polinterest', 'adjustedincome']

must_include_list = ['tf_idf_mean', 'tf_idf_entity_weighted', 'tf_idf_importance_weighted']

# Identify categorical and continuous features
categorical_features = ['polinterest', 'adjustedincome'] 
continuous_features = ['duration', 'news_proportion', 'age'] + must_include_list

# Generate feature subsets ensuring at least one 'must_include' feature is present
feature_combinations = []
for r in range(1, len(features_list) + 1):
    for subset in itertools.combinations(features_list, r):
        for must_include in must_include_list:
            feature_combinations.append(list(subset) + [must_include])

# Remove duplicates and ensure valid features only
feature_combinations = [list(set(fc)) for fc in feature_combinations]
feature_combinations = [list(fc) for fc in set(tuple(sorted(fc)) for fc in feature_combinations)]

# Track results
results = []

# Train models for each feature combination and predictor
for y_var in predictors:
    for feature_set in feature_combinations:
        # Ensure selected features exist in the dataset
        valid_features = [feat for feat in feature_set if feat in news.columns]
        X = news[valid_features]
        y = news[y_var]

        # Encode categorical target
        y = (y == y.unique()[1]).astype(int)

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        
        # Preprocessing pipeline
        preprocessor = ColumnTransformer(transformers=[
            ('num', StandardScaler(), [feat for feat in valid_features if feat in continuous_features]),
            ('cat', OneHotEncoder(handle_unknown='ignore'), [feat for feat in valid_features if feat in categorical_features])
        ])


        model = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', LinearSVC(dual=False, max_iter=5000, class_weight='balanced'))
        ])

        # Train model
        model.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        # Store results
        results.append((y_var, valid_features, f1, precision, recall, accuracy))

# Sort and get top 5 results
top_5_results = sorted(results, key=lambda x: x[2], reverse=True)

# Convert to DataFrame and display
top_5_df = pd.DataFrame(top_5_results, columns=['Predictor', 'Features', 'F1 Score', 'Precision', 'Recall', 'Accuracy'])



In [20]:
top_5_df.sort_values(by='F1 Score', ascending=False).head()

Unnamed: 0,Predictor,Features,F1 Score,Precision,Recall,Accuracy
0,secondvote,"[adjustedincome, age, ml_employ, news_proporti...",0.360248,0.25,0.644444,0.6171
129,secondvote,"[adjustedincome, age, ml_employ, ml_gender_fem...",0.360248,0.25,0.644444,0.6171
163,secondvote,"[adjustedincome, age, duration, ml_degreestatu...",0.360248,0.25,0.644444,0.6171
164,secondvote,"[adjustedincome, age, duration, ml_degreestatu...",0.360248,0.25,0.644444,0.6171
165,secondvote,"[adjustedincome, age, duration, ml_employ, ml_...",0.360248,0.25,0.644444,0.6171


In [21]:
# Filter rows where 'Features' contains 'tf_idf_entity_weighted'
filtered_df = top_5_df[top_5_df['Features'].apply(lambda x: 'tf_idf_entity_weighted' in x)]
filtered_df_sorted = filtered_df.sort_values(by='F1 Score', ascending=False)
filtered_df_sorted.head()


Unnamed: 0,Predictor,Features,F1 Score,Precision,Recall,Accuracy
259,secondvote,"[adjustedincome, ml_degreestatus, ml_gender_fe...",0.343949,0.241071,0.6,0.6171
319,secondvote,"[adjustedincome, ml_degreestatus, ml_employ, m...",0.343949,0.241071,0.6,0.6171
322,secondvote,"[adjustedincome, ml_degreestatus, ml_employ, m...",0.343949,0.241071,0.6,0.6171
325,secondvote,"[adjustedincome, ml_employ, ml_married, ml_pol...",0.343949,0.241071,0.6,0.6171
326,secondvote,"[adjustedincome, news_proportion, tf_idf_entit...",0.343949,0.241071,0.6,0.6171


In [22]:
list(filtered_df_sorted['Features'][0:1])

[['adjustedincome',
  'ml_degreestatus',
  'ml_gender_female',
  'ml_married',
  'ml_political_interest',
  'ml_relationship',
  'news_proportion',
  'tf_idf_entity_weighted']]

The below seems to suggest that importance_weighted comes off slightly higher in terms of F1 scores

In [23]:
# Filter rows where 'Features' contains 'tf_idf_entity_weighted'
filtered_df2 = top_5_df[top_5_df['Features'].apply(lambda x: 'tf_idf_importance_weighted' in x)]

filtered_df2_sorted = filtered_df2.sort_values(by='F1 Score', ascending=False)
filtered_df2_sorted.head()

Unnamed: 0,Predictor,Features,F1 Score,Precision,Recall,Accuracy
256,secondvote,"[adjustedincome, news_proportion, tf_idf_impor...",0.343949,0.241071,0.6,0.6171
328,secondvote,"[adjustedincome, ml_employ, ml_married, ml_rel...",0.343949,0.241071,0.6,0.6171
331,secondvote,"[adjustedincome, ml_married, ml_political_inte...",0.343949,0.241071,0.6,0.6171
333,secondvote,"[adjustedincome, ml_employ, ml_married, ml_pol...",0.343949,0.241071,0.6,0.6171
334,secondvote,"[adjustedincome, ml_degreestatus, ml_gender_fe...",0.343949,0.241071,0.6,0.6171


In [24]:
list(filtered_df2_sorted['Features'][0:1])

[['adjustedincome', 'news_proportion', 'tf_idf_importance_weighted']]

Just checking to see where the accuracies from above falls

In [25]:
top_5_df['Accuracy'].sort_values(ascending=False)

0       0.617100
193     0.617100
263     0.617100
262     0.617100
261     0.617100
          ...   
5274    0.394052
5275    0.394052
5276    0.394052
5277    0.394052
5311    0.394052
Name: Accuracy, Length: 6141, dtype: float64