# Model 4

## Import libraries

In [31]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier, LassoCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.feature_extraction import text 
from sklearn.metrics import mean_absolute_error, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

import xgboost as xgb

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords


## Read in Subreddit Data

In [32]:
# Columns loaded from the CSV: the label, the raw and cleaned text fields,
# the length counts, and the neg/neu/pos/compound scores
# (presumably sentiment scores - confirm against the data-prep notebook).
csv_columns = [
    'subreddit',
    'selftext', 'title', 'corrected_title',
    'char_count', 'word_count',
    'neg', 'neu', 'pos', 'compound',
]
text_columns = ['selftext', 'title', 'corrected_title']

df = pd.read_csv('../data/Subreddits_Data.csv', usecols=csv_columns)

# Replace missing text with an empty string so downstream text processing
# never receives NaN.
df[text_columns] = df[text_columns].fillna('')

## Text Vectorization (TfidfVectorizer)

In [33]:
# NLTK's English stop-word list, removed from the titles before weighting.
stop_words = stopwords.words('english')

# Single-step pipeline that turns each corrected title into TF-IDF weighted
# unigram features. A raw-count variant is kept below as a commented-out
# alternative.
count_vec_pipe = Pipeline(
    [
        ('tfid', TfidfVectorizer(stop_words=stop_words, ngram_range=(1,1))),
        # ('vect', CountVectorizer(stop_words=stop_words, ngram_range=(1,1))),
    ]
)

# Learn the vocabulary and build the document-term matrix in one pass
# (equivalent to fit followed by transform on the same data).
# NOTE(review): the vectorizer is fitted on the full dataset before any
# train/test split, so vocabulary/IDF statistics include the test rows.
title_term_matrix = count_vec_pipe.fit_transform(df['corrected_title'])

# Densify with the documented .toarray() method; the `.A` shorthand is not
# available on newer SciPy sparse array types.
df_count_vec = pd.DataFrame(
    title_term_matrix.toarray(),
    columns=count_vec_pipe.get_feature_names_out(),
)


In [34]:

# Place the original columns and the per-term TF-IDF columns side by side.
combined = pd.concat([df, df_count_vec], axis=1)

# A vocabulary term can share its name with an original column; keep only
# the first occurrence of every column name (the original column wins
# because `df` comes first in the concat).
repeated_name = combined.columns.duplicated()
master_df = combined.loc[:, ~repeated_name]

In [35]:
# Text column that the vectorized features were derived from.
cat_col = [
    'corrected_title',
]

# Numeric columns that get standard-scaled inside the model pipelines.
num_col = [
    'word_count',
    'char_count',
    'compound',
    'pos',
    'neu',
    'neg',
]

## Finding the Best Model

In [23]:
# Baseline: Multinomial Naive Bayes on the TF-IDF term columns plus the
# word/character counts. The label, the raw text columns and the
# compound/pos/neu/neg columns are excluded from the features.
# NOTE(review): MultinomialNB rejects negative feature values, which is
# presumably why the score columns are dropped here - confirm.
X = master_df.drop(columns=['subreddit', 'selftext', 'title', 'corrected_title', 'compound', 'pos', 'neu', 'neg'])
y = master_df['subreddit']

# Stratified split with a fixed seed so every model cell sees the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# The previous version of this cell also built a ColumnTransformer (`ct`)
# that listed classifiers as transformers without a column spec and scaled
# columns that are dropped from X above. It was never used and would have
# raised if fitted, so it has been removed.
mnb_model = Pipeline(
                    [
                        ('clf', MultinomialNB(alpha=.1))
                    ]
                )
mnb_model.fit(X_train, y_train)

# (train accuracy, test accuracy)
mnb_model.score(X_train, y_train),mnb_model.score(X_test, y_test)

(0.9150648538834193, 0.7473634872275603)

In [66]:
# Target and features: everything except the label and the raw text columns
# is used as a feature.
y = master_df['subreddit']
X = master_df.drop(columns=['subreddit', 'selftext', 'title', 'corrected_title'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Voting ensemble: AdaBoost + XGBoost + Bagging + ExtraTrees, all with
# default settings and equal votes.
# NOTE(review): the members are built without random_state, so the scores
# can differ between runs.
vc1 = VotingClassifier(
    estimators=[
        ('adb', AdaBoostClassifier()),
        ('boosting', xgb.XGBClassifier()),
        ('bagging', BaggingClassifier()),
        ('etc', ExtraTreesClassifier()),
    ],
    n_jobs=-1,
)

# Standard-scale the numeric columns; every other column passes through.
ct1 = ColumnTransformer(
    transformers=[('ss', StandardScaler(), num_col)],
    remainder='passthrough',
)

pipe = Pipeline(steps=[('ct', ct1), ('vc1', vc1)])
pipe.fit(X_train, y_train)

# (train accuracy, test accuracy)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9794692038057086, 0.8119555422048663)

In [67]:
# Features are all columns except the label and the raw text fields.
non_feature_cols = ['subreddit', 'selftext', 'title', 'corrected_title']
X = master_df.drop(columns=non_feature_cols)
y = master_df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Ensemble variant that uses logistic regression as the fourth voter
# alongside AdaBoost, XGBoost and Bagging.
# NOTE(review): the tree-based members are unseeded, so repeated runs can
# give slightly different scores.
members = [
    ('adb', AdaBoostClassifier()),
    ('boosting', xgb.XGBClassifier()),
    ('bagging', BaggingClassifier()),
    ('lr', LogisticRegression(max_iter=10_000)),
]
vc1 = VotingClassifier(members, n_jobs=-1)

# Only the numeric columns are standard-scaled; the remaining columns are
# forwarded unchanged.
scaler_step = ('ss', StandardScaler(), num_col)
ct1 = ColumnTransformer([scaler_step], remainder='passthrough')

pipe = Pipeline([
    ('ct', ct1),
    ('vc1', vc1),
])
pipe.fit(X_train, y_train)

# (train accuracy, test accuracy)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9457185778668002, 0.807149294082307)

In [68]:
# Label column, and feature matrix without the label / raw text columns.
y = master_df['subreddit']
X = master_df.drop(
    columns=['subreddit', 'selftext', 'title', 'corrected_title']
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    stratify=y,
)

# Ensemble variant with a random forest as the fourth voter.
# NOTE(review): no random_state is passed to the members, so results are
# not exactly repeatable.
vc1 = VotingClassifier(
    [
        ('adb', AdaBoostClassifier()),
        ('boosting', xgb.XGBClassifier()),
        ('bagging', BaggingClassifier()),
        ('rfc', RandomForestClassifier()),
    ],
    n_jobs=-1,
)

# Preprocessing: StandardScaler on num_col, passthrough for everything else.
ct1 = ColumnTransformer(
    [('ss', StandardScaler(), num_col)],
    remainder='passthrough',
)

pipe = Pipeline([('ct', ct1), ('vc1', vc1)])
pipe.fit(X_train, y_train)

# (train accuracy, test accuracy)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9792689033550326, 0.8104535896665666)

In [25]:
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt.
excluded = ['subreddit', 'selftext', 'title', 'corrected_title']
X = master_df.drop(columns=excluded)
y = master_df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Five-member ensemble. Votes are weighted so that the random forest and
# the extra-trees model (0.30 each) count three times as much as each of
# the other members (0.10 each).
voters = [
    ('adb', AdaBoostClassifier()),
    ('boosting', xgb.XGBClassifier()),
    ('bagging', BaggingClassifier()),
    ('rfc', RandomForestClassifier()),
    ('etc', ExtraTreesClassifier()),
]
vote_weights = [.1, .1, .1, .30, .30]
vc1 = VotingClassifier(voters, weights=vote_weights, n_jobs=-1)

# Standard-scale the numeric columns; leave the other columns untouched.
ct1 = ColumnTransformer(
    [('ss', StandardScaler(), num_col)],
    remainder='passthrough',
)

pipe = Pipeline([('ct', ct1), ('vc1', vc1)])
pipe.fit(X_train, y_train)

# (train accuracy, test accuracy)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)


KeyboardInterrupt: 

In [26]:
# Features: every column except the label and the raw text columns.
y = master_df['subreddit']
X = master_df.drop(columns=['subreddit', 'selftext', 'title', 'corrected_title'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# (name, estimator, vote weight) for each ensemble member. The two forest
# models carry 0.30 each, the remaining three 0.10 each.
# NOTE(review): members are unseeded, so scores can vary between runs.
weighted_members = [
    ('adb', AdaBoostClassifier(), .1),
    ('boosting', xgb.XGBClassifier(), .1),
    ('bagging', BaggingClassifier(), .1),
    ('rfc', RandomForestClassifier(), .30),
    ('etc', ExtraTreesClassifier(), .30),
]
vc1 = VotingClassifier(
    [(label, model) for label, model, _ in weighted_members],
    weights=[weight for _, _, weight in weighted_members],
    n_jobs=-1,
)

# Standard-scale the numeric columns; pass all other columns through as-is.
ct1 = ColumnTransformer(
    transformers=[('ss', StandardScaler(), num_col)],
    remainder='passthrough',
)

pipe = Pipeline(steps=[('ct', ct1), ('vc1', vc1)])
pipe.fit(X_train, y_train)

# (train accuracy, test accuracy)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(1.0, 0.8706351066322944)

## Tuning Hyperparameters

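- **Note:** Hyperparameter tuning could not be completed because the grid search would take multiple days to run, even with only one hyperparameter in `GridSearchCV`. The likely reason is cost per candidate: every parameter combination refits the entire five-model voting ensemble once per cross-validation fold on the full TF-IDF feature matrix.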

In [36]:
# Grid search over the weighted five-member ensemble.
X = master_df.drop(columns=['subreddit', 'selftext', 'title', 'corrected_title'])
y = master_df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Same ensemble and vote weights as the best untuned model; the XGBoost
# booster is pinned to 'gbtree'.
vc1 = VotingClassifier(
    [
        ('adb', AdaBoostClassifier()),
        ('boosting', xgb.XGBClassifier(booster='gbtree')),
        ('bagging', BaggingClassifier()),
        ('rfc', RandomForestClassifier()),
        ('etc', ExtraTreesClassifier()),
    ],
    weights = [.1, .1, .1, .30, .30],
)

# Standard-scale the numeric columns; all other columns pass through.
ct1 = ColumnTransformer([
    ('ss', StandardScaler(), num_col),
], remainder = 'passthrough')

pipe2 = Pipeline([
    ('ct', ct1),
    ('vc1', vc1)
])

# Parameter grid, addressed as <pipeline step>__<ensemble member>__<param>.
# The L2/L1 regularisation terms use the scikit-learn wrapper's own
# constructor names (reg_lambda / reg_alpha) rather than the native
# 'lambda' / 'alpha' aliases, which would only reach XGBoost through its
# **kwargs passthrough.
# Size: 3 * 3 * 3 * 2 * 2 = 108 candidates; with GridSearchCV's default
# 5-fold CV that is 540 full ensemble fits plus the final refit.
pipe2_params = {
    'vc1__boosting__gamma': [0, 1, 2],
    'vc1__boosting__reg_lambda': [0, 1, 2],
    'vc1__boosting__reg_alpha': [0, 1, 2],
    'vc1__rfc__criterion': ['gini', 'entropy'],
    'vc1__etc__criterion': ['gini', 'entropy'],
}

gs = GridSearchCV(
    pipe2,
    pipe2_params,
    n_jobs=-1
)
gs.fit(X_train, y_train)

# (train accuracy, test accuracy) of the refitted best candidate
gs.score(X_train, y_train), gs.score(X_test, y_test)

## Resources
- https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
- https://www.kaggle.com/getting-started/42409
- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html