In [2]:
import time, json
import pandas as pd
from pprint import pprint

import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, FeatureHasher, TfidfTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

import smtplib
from email.mime.text import MIMEText

import warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sharhad.bashar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Read the training set and list the columns it provides
df = pd.read_csv('../data/train/train.csv')
pprint(df.columns)

# Model inputs (the three one-hot-encoded fields plus the 'combine' text field),
# the target label, and the per-row confidence later used as a sample weight
X = df[['browserFamily', 'deviceType', 'os', 'combine']]
y = df['iab_categories']
confidence_scores = df['confidence']

# Hold out 20% of the rows; splitting all three objects in a single call keeps
# features, labels and confidences row-aligned
(
    X_train, X_test,
    y_train, y_test,
    confidence_train, confidence_test,
) = train_test_split(
    X, y, confidence_scores,
    test_size = 0.2,
    random_state = 42,
)


Index(['id', 'browserFamily', 'deviceType', 'os', 'iab_categories',
       'confidence', 'combine'],
      dtype='object')


In [4]:
# Sanity check: show the shape of each training split side by side
print(*(part.shape for part in (X_train, y_train, confidence_train)))

(7298348, 4) (7298348,) (7298348,)


In [79]:
def bag_of_words(X, col = 'name_title'):
    """Turn text documents into a bag-of-words count matrix.

    Parameters
    ----------
    X : iterable of str, pandas Series, or pandas DataFrame
        The documents to vectorize. When ``X`` is a DataFrame that contains
        ``col``, only that column is vectorized.
    col : str
        Name of the text column to use when ``X`` is a DataFrame.

    Returns
    -------
    The value returned by ``CountVectorizer(stop_words = 'english').fit_transform``.

    Notes
    -----
    The fitted vectorizer is local to this function and is not returned, so the
    learned vocabulary cannot be reused to transform other data.
    """
    # Iterating a DataFrame yields its column labels, not its rows, so the text
    # column has to be picked out explicitly. Inputs without `col` (Series,
    # lists, frames lacking the column) are passed through exactly as before.
    if isinstance(X, pd.DataFrame) and col in X.columns:
        X = X[col]

    vectorizer = CountVectorizer(stop_words = 'english')
    return vectorizer.fit_transform(X)

def one_hot_encoding(X, col = 'name_title'):
    """One-hot encode a single column of categorical values.

    Parameters
    ----------
    X : pandas Series or pandas DataFrame
        The values to encode. When ``X`` is a DataFrame that contains ``col``,
        only that column is encoded.
    col : str
        Name of the column to use when ``X`` is a DataFrame.

    Returns
    -------
    The value returned by ``OneHotEncoder().fit_transform`` for the values
    reshaped into a single column.

    Notes
    -----
    The fitted encoder is local to this function and is not returned.
    """
    # Without this selection a multi-column frame would be flattened by the
    # reshape below into one long column mixing every field together.
    # Inputs that do not contain `col` are handled exactly as before.
    if isinstance(X, pd.DataFrame) and col in X.columns:
        X = X[col]

    one_hot_encoder = OneHotEncoder()
    # The encoder expects a 2-D array: one row per sample, one feature column
    column = X.values.reshape(-1, 1)
    return one_hot_encoder.fit_transform(column)

def word_2_vector(X):
    """Train a Word2Vec model on ``X`` and return it.

    Parameters
    ----------
    X
        Training corpus handed straight to ``Word2Vec``.
        NOTE(review): presumably an iterable of tokenized sentences (lists of
        str), as the gensim API expects -- confirm against the caller.

    Returns
    -------
    The ``Word2Vec`` model built with ``vector_size = 100``, ``window = 5`` and
    ``min_count = 2``.
    """
    # Imported here because the notebook's import cell does not bring in gensim
    from gensim.models import Word2Vec

    # Return the model: it was previously built and then discarded
    return Word2Vec(X, vector_size = 100, window = 5, min_count = 2)

def glove(X):
    """Placeholder for a GloVe-based featurizer.

    No embedding is applied yet: the input is handed back unchanged.
    """
    return X

def tfidf(X, col = 'name_title'):
    """Compute TF-IDF features for one text column of ``X``.

    Parameters
    ----------
    X : mapping-like (e.g. pandas DataFrame)
        Container indexed with ``X[col]`` to obtain the documents.
    col : str
        Key of the text column to vectorize.

    Returns
    -------
    The value returned by ``TfidfVectorizer.fit_transform`` on ``X[col]``.
    """
    # max_df = 0.8 and max_features = 10000 are the vectorizer's only
    # non-default settings
    weigher = TfidfVectorizer(max_df = 0.8, max_features = 10000)
    documents = X[col]
    features = weigher.fit_transform(documents)
    return features

def countvector_tfidtransform(X, col = ''):
    """Build TF-IDF features in two explicit steps: term counts, then re-weighting.

    Parameters
    ----------
    X : mapping-like (e.g. pandas DataFrame)
        Container indexed with ``X[col]`` to obtain the documents.
    col : str
        Key of the text column. The default is the empty string, so callers
        will normally need to pass the real column name.

    Returns
    -------
    The value returned by ``TfidfTransformer().fit_transform`` applied to the
    output of ``CountVectorizer(stop_words = 'english').fit_transform(X[col])``.
    """
    counter = CountVectorizer(stop_words = 'english')
    reweighter = TfidfTransformer()

    # Step 1: raw term counts; step 2: TF-IDF weighting of those counts
    term_counts = counter.fit_transform(X[col])
    return reweighter.fit_transform(term_counts)

In [5]:
# Define a ColumnTransformer to separately process text and categorical features
preprocessor = ColumnTransformer(
    transformers = [
        # Bag-of-words counts for the free-text column
        # (TfidfVectorizer can be swapped in here)
        ('text', CountVectorizer(), 'combine'),
        # handle_unknown = 'ignore': a category that appears only in the
        # held-out rows is encoded as all zeros instead of raising at predict time
        ('categorical', OneHotEncoder(handle_unknown = 'ignore'), ['browserFamily', 'deviceType', 'os'])
    ],
    remainder = 'passthrough'
)
# Create a pipeline with the preprocessor and SGDClassifier
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' rather
# than 'log' -- confirm against the installed version before changing it.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss = 'log', random_state = 42, class_weight = 'balanced'))
])

# clf = Pipeline([
#     ('vectorizer', CountVectorizer()),
#     ('scaler', StandardScaler(with_mean = False)),
#     ('classifier', SGDClassifier(loss = 'log', random_state = 42, class_weight = 'balanced'))
# ])

In [82]:
# Fit the pipeline so that the prediction cell below has a trained model to use
# after a kernel restart; each row's confidence is forwarded to the classifier
# step as its sample weight.
clf.fit(X_train, y_train, classifier__sample_weight = confidence_train)

In [60]:
# Predict a label for every held-out row with the `clf` pipeline
predictions = clf.predict(X_test)

# Score the predictions against the true held-out labels and report the result
accuracy = accuracy_score(y_true = y_test, y_pred = predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.09055583537534796


SGDClassifier Accuracy: 0.14313814578312792 -> 32 min

LogisticRegression Accuracy: 0.35889656124920327 -> 319 min

In [6]:
def train(classifier, classifier_name):
    """Fit ``classifier`` on the training split and score it on the test split.

    The classifier is wrapped in a pipeline that vectorizes the ``'combine'``
    text column with ``CountVectorizer`` and one-hot encodes ``browserFamily``,
    ``deviceType`` and ``os``. Data comes from the notebook-level variables
    ``X_train``, ``y_train``, ``confidence_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    classifier
        Unfitted estimator exposing ``fit`` and ``predict``.
    classifier_name : str
        Label for the run; returned unchanged.

    Returns
    -------
    tuple
        ``(classifier_name, accuracy, seconds)`` where ``accuracy`` is the
        ``accuracy_score`` on the test split and ``seconds`` is the wall-clock
        time spent in ``fit`` only (prediction is not timed).
    """
    # Local import: `inspect` is not part of the notebook's import cell
    import inspect

    preprocessor = ColumnTransformer(
        transformers = [
            ('text', CountVectorizer(), 'combine'),
            # handle_unknown = 'ignore': a category seen only in the test split
            # is encoded as all zeros instead of raising during predict
            ('categorical', OneHotEncoder(handle_unknown = 'ignore'), ['browserFamily', 'deviceType', 'os'])
        ],
        remainder = 'passthrough'
    )
    clf = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

    # Not every estimator's fit() takes sample_weight; passing it
    # unconditionally raises TypeError for those that do not. Forward the
    # confidences only when the estimator declares the parameter.
    fit_params = {}
    if 'sample_weight' in inspect.signature(classifier.fit).parameters:
        fit_params['classifier__sample_weight'] = confidence_train

    start = time.time()
    clf.fit(X_train, y_train, **fit_params)
    end = time.time() - start

    predictions = clf.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    return classifier_name, accuracy, end

In [7]:
class Email:
    """Small helper that sends plain-text notification emails through Gmail.

    Settings are read from a JSON file that must provide the keys ``name``,
    ``sender``, ``to`` and ``token``.
    """

    def __init__(self, config_path = '../config/email.json'):
        """Load the mail settings.

        Parameters
        ----------
        config_path : str
            Location of the JSON settings file. Defaults to the path the
            notebook has always used.
        """
        # Context manager so the file handle is closed right after parsing
        with open(config_path) as config_file:
            config = json.load(config_file)

        self.name = config['name']
        self.sender = config['sender']
        self.to = config['to']
        self.token = config['token']

    def send_email(self, subject, message):
        """Send ``message`` as a plain-text email with the given ``subject``.

        The mail goes from ``self.sender`` to ``self.to`` over an SSL
        connection to ``smtp.gmail.com`` on port 465, logging in with
        ``self.sender`` and ``self.token``.
        """
        msg = MIMEText(message)
        msg['To'] = self.to
        msg['From'] = self.sender
        msg['Subject'] = subject

        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as smtp_server:
            smtp_server.login(self.sender, self.token)
            smtp_server.sendmail(self.sender, self.to, msg.as_string())
            
# email = Email()
# classifier_name = 'log'
# accuracy = 0.09055583537534796
# end = 1232
# message = f'{classifier_name} has an accuracy of {round(accuracy, 5)} and took {end} to run'
# email.send_email('subject', message)

In [9]:
# Candidate models, each paired with the label used in the console output and
# in the notification email. (The list keeps its original spelling in case
# other cells refer to it.)
classfiers = [
    [RandomForestClassifier(class_weight = 'balanced'), 'random forest'],
    [LogisticRegression(class_weight = 'balanced'), 'logistic regression'],
    [SGDClassifier(loss = 'log', random_state = 42, class_weight = 'balanced'), 'sgd classifier'],
    [LinearSVC(class_weight = 'balanced'), 'linear svc'],
    [GradientBoostingClassifier(), 'g boost'],
    [KNeighborsClassifier(), 'k neighbor'],
    [DecisionTreeClassifier(class_weight = 'balanced'), 'decision tree'],
    [MLPClassifier(), 'mlp classifier'],
    [MultinomialNB(), 'multinomial nb']
]

# Build the mailer once, before any training starts, so a missing or broken
# email config fails immediately rather than after the first long fit.
email = Email()

for classifier, classifier_name in classfiers:
    classifier_name, accuracy, end = train(classifier, classifier_name)
    print(classifier_name, accuracy, end)
    message = f'{classifier_name} has an accuracy of {round(accuracy, 5)} and took {end} to run'
    # The classifier label is the subject; the body must be passed too,
    # because send_email requires both arguments
    email.send_email(classifier_name, message)
    