### Assignment 1: Text Classification
- Name: Shrey Patel
- Student ID: 101541370

##### Algorithms:
- Multinomial Naïve Bayes
- Logistic Regression 
- Support Vector Machines
- Decision Tree

##### NLP Feature Extraction Methods:
- CountVectorizer (bag-of-words counts)
- TfidfVectorizer (TF-IDF weights)
- Word2Vec (averaged word embeddings)



In [2]:
from pprint import pprint
from time import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np

from gensim.models import Word2Vec, doc2vec
from gensim.models.doc2vec import TaggedDocument

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the 20 Newsgroups corpus with subset='all'.
# NOTE(review): no `remove=` argument is passed — confirm that keeping the
# message headers/footers/quotes in the text is intended for this benchmark.
data_news = fetch_20newsgroups(subset='all')
X, y = data_news.data, data_news.target

print(f'Number of samples: {len(X)}')

Number of samples: 18846


- Splitting the data into training (70%) and test (30%) sets

In [4]:
# Hold out 30% of the documents for evaluation; the fixed random_state makes
# the split identical on every run.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

print(f'TRAIN SAMPLES: {len(X_train_raw)}')
print(f'Test sample: {len(X_test_raw)}')

TRAIN SAMPLES: 13192
Test sample: 5654


In [11]:
def get_countvec(train, test):
    """Encode raw documents as token-count matrices.

    The vectorizer is fitted on ``train`` only and then reused to encode
    ``test``, so both outputs come from the same fitted vectorizer.

    Args:
        train: iterable of training documents (strings).
        test: iterable of test documents (strings).

    Returns:
        tuple: ``(train_matrix, test_matrix)`` as returned by
        ``CountVectorizer.fit_transform`` / ``CountVectorizer.transform``.
    """
    bow = CountVectorizer()
    train_counts = bow.fit_transform(train)
    return train_counts, bow.transform(test)

def get_tfidf(train, test):
    """Encode raw documents as TF-IDF matrices.

    The vectorizer is fitted on ``train`` only and then reused to encode
    ``test``, so both outputs come from the same fitted vectorizer.

    Args:
        train: iterable of training documents (strings).
        test: iterable of test documents (strings).

    Returns:
        tuple: ``(train_matrix, test_matrix)`` as returned by
        ``TfidfVectorizer.fit_transform`` / ``TfidfVectorizer.transform``.
    """
    tfidf = TfidfVectorizer()
    train_weights = tfidf.fit_transform(train)
    return train_weights, tfidf.transform(test)

def _mean_doc_vectors(tokenized_docs, keyed_vectors):
    """Average the word vectors of every tokenized document.

    Words that are not present in ``keyed_vectors`` are skipped. A document
    with no known words falls back to an all-zero vector whose length is taken
    from ``keyed_vectors.vector_size`` rather than a hard-coded constant.
    """
    zero_vec = np.zeros(keyed_vectors.vector_size)
    doc_vectors = []
    for words in tokenized_docs:
        known = [keyed_vectors[word] for word in words if word in keyed_vectors]
        # `known or [...]` substitutes the zero vector when the list is empty,
        # which would otherwise make np.mean return NaN.
        doc_vectors.append(np.mean(known or [zero_vec], axis=0))
    return np.array(doc_vectors)


def get_word2vec(train, test, vector_size=100, window=5, min_count=1,
                 workers=4, epochs=20):
    """Encode documents as the mean of their Word2Vec word embeddings.

    Documents are tokenized with a plain whitespace ``str.split()``. The
    Word2Vec model is trained on the training documents only; test-set words
    the model has not seen are ignored when averaging.

    Args:
        train: iterable of training documents (strings).
        test: iterable of test documents (strings).
        vector_size: dimensionality of the word embeddings.
        window: context window size passed to ``Word2Vec``.
        min_count: minimum word frequency passed to ``Word2Vec``.
        workers: number of training threads passed to ``Word2Vec``.
        epochs: number of training epochs passed to ``Word2Vec``.

    Returns:
        tuple: ``(X_train, X_test)`` numpy arrays with one averaged embedding
        per document.

    NOTE(review): the gensim documentation states that training with more than
    one worker thread is not exactly reproducible between runs; use
    ``workers=1`` if bit-identical features are required — confirm.
    """
    train_tok = [doc.split() for doc in train]
    test_tok = [doc.split() for doc in test]

    model = Word2Vec(
        sentences=train_tok,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        epochs=epochs,
    )

    # The same averaging routine serves both splits so they cannot diverge.
    X_train = _mean_doc_vectors(train_tok, model.wv)
    X_test = _mean_doc_vectors(test_tok, model.wv)
    return X_train, X_test
    


In [12]:
# Feature-extraction strategies keyed by the label used in the printed report.
# Every value is called as fn(train_docs, test_docs) and returns
# (train_features, test_features).
feature_extractors = dict([
    ('CountVectorizer', get_countvec),
    ('TF-IDF Vectorizer', get_tfidf),
    ('Word2Vec', get_word2vec),
])

# Classifiers to benchmark, keyed by the label used in the printed report.
# random_state is pinned (same seed as the train/test split) on the estimators
# that accept one and were previously left unseeded, so that reported
# accuracies can be reproduced on a rerun.
algorithms = {
    'Multinomial Naïve Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machines': LinearSVC(random_state=101),
    'Decision Trees': DecisionTreeClassifier(random_state=101)
}

In [13]:
# One record per (feature extractor, classifier) pair that trained and
# predicted without raising.
results = []

for feature_name, feat_fun in feature_extractors.items():
    print(f'\n Feature Extracting: {feature_name}')
    # Features are computed once per extractor and reused by every classifier.
    X_train_feat, X_test_feat = feat_fun(X_train_raw, X_test_raw)

    for algo_name, alg in algorithms.items():
        print(f'\n algoriths: {algo_name}', end=" -")
        try:
            model = alg.fit(X_train_feat, y_train)
            prediction = model.predict(X_test_feat)

            # Accuracy as a percentage, rounded once for both the table and the log line.
            accuracy_val = accuracy_score(y_test, prediction) * 100
            accuracy_rounded = round(accuracy_val, 2)

            results.append({
                'Algorithm': algo_name,
                'Feature Extractor': feature_name,
                'Accuracy (%)': accuracy_rounded,
            })

            print(f'Accuracy: {accuracy_rounded}%')
        except Exception as e:
            # Incompatible pairings (e.g. MultinomialNB rejecting negative
            # Word2Vec features) are reported and left out of the results.
            print(f'Skipping due to error occurence: {e}')


 Feature Extracting: CountVectorizer

 algoriths: Multinomial Naïve Bayes -Accuracy: 85.04%

 algoriths: Logistic Regression -Accuracy: 88.73%

 algoriths: Support Vector Machines -



Accuracy: 88.68%

 algoriths: Decision Trees -Accuracy: 64.52%

 Feature Extracting: TF-IDF Vectorizer

 algoriths: Multinomial Naïve Bayes -Accuracy: 84.79%

 algoriths: Logistic Regression -Accuracy: 89.58%

 algoriths: Support Vector Machines -



Accuracy: 93.21%

 algoriths: Decision Trees -Accuracy: 62.81%

 Feature Extracting: Word2Vec

 algoriths: Multinomial Naïve Bayes -Skipping due to error occurence: Negative values in data passed to MultinomialNB (input X)

 algoriths: Logistic Regression -Accuracy: 57.61%

 algoriths: Support Vector Machines -



Accuracy: 56.07%

 algoriths: Decision Trees -Accuracy: 24.05%


In [18]:
# Tabulate the collected records, best accuracy first.
results_df = (
    pd.DataFrame(results)
    .sort_values(by='Accuracy (%)', ascending=False)
)

print('\n Benchmarked Results: ')
print(results_df.to_string(index=False))


 Benchmarked Results: 
              Algorithm Feature Extractor  Accuracy (%)
Support Vector Machines TF-IDF Vectorizer         93.21
    Logistic Regression TF-IDF Vectorizer         89.58
    Logistic Regression   CountVectorizer         88.73
Support Vector Machines   CountVectorizer         88.68
Multinomial Naïve Bayes   CountVectorizer         85.04
Multinomial Naïve Bayes TF-IDF Vectorizer         84.79
         Decision Trees   CountVectorizer         64.52
         Decision Trees TF-IDF Vectorizer         62.81
    Logistic Regression          Word2Vec         57.61
Support Vector Machines          Word2Vec         56.07
         Decision Trees          Word2Vec         24.05


In [19]:
# Single definition of the report path so the file that is written and the
# path shown in the confirmation message cannot drift apart.
output_path = 'Shrey_task1_text_classification.txt'

# Tab-separated export of the sorted benchmark table, without the index column.
results_df.to_csv(output_path, sep='\t', index=False)
print(f"\n Grate!!Output is saved to '{output_path}'")



 Grate!!Output is saved to 'Shrey_task1_text_classification.txt'
