# IRS Practical 3
> 19BCE245 - Aayush Shah

## 1. Explore `CountVectorizer` and `TfidfVectorizer`

  - ### with `CountVectorizer` : 

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [None]:
# Toy corpus of four short sentences; the vectorizer cells below all fit on it.
corpus = [
  'This is the first document.',
  'This document is the second document.',
  'And this is the third one.',
  'Is this the first document?',
]

In [None]:
# Fit a CountVectorizer with default settings on the corpus; X holds the
# resulting document-term matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# Last expression of the cell: displays the learned feature names.
vectorizer.get_feature_names_out()

In [None]:
# Convert the document-term matrix to a dense array for printing.
print(X.toarray())

In [None]:
# ngram_range=(2,2) with the word analyzer: features are word bigrams only.
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2,2))
X2 = vectorizer2.fit_transform(corpus)
# Last expression of the cell: displays the bigram feature names.
vectorizer2.get_feature_names_out()

In [None]:
# Dense view of the bigram count matrix.
print(X2.toarray())

In [None]:
# Unigrams through trigrams (ngram_range=(1,3)) using scikit-learn's built-in
# English stop-word list; decode_error='ignore' only matters for bytes input.
vectorizer3 = CountVectorizer(decode_error='ignore', stop_words='english', ngram_range=(1,3))
X3 = vectorizer3.fit_transform(corpus)
# Last expression of the cell: displays the resulting feature names.
vectorizer3.get_feature_names_out()

In [None]:
# Dense view of the 1- to 3-gram count matrix.
print(X3.toarray())

  - ### with `TfidfVectorizer` : 

In [None]:
# Same corpus, now weighted with TF-IDF instead of raw counts.
# NOTE: this rebinds `vectorizer` and `X` from the CountVectorizer cell above.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# Last expression of the cell: displays the learned feature names.
vectorizer.get_feature_names_out()

In [None]:
# Report the matrix shape, the feature names, and the dense TF-IDF weights.
print(X.shape)
print(vectorizer.get_feature_names_out())
print(X.toarray())

## 2. Repeat part 1 using file handling

In [None]:
# Making files: write each corpus sentence to data1.txt, data2.txt, ...
for i in range(len(corpus)):
  # `with` closes the handle even if the write raises.
  with open("data"+str(i+1)+".txt", "w") as f:
    f.write(corpus[i])

In [None]:
# Reading files: load data1.txt, data2.txt, ... back into a list of strings,
# one entry per file, in the same order as `corpus`.
extracted_corpus = []
for i in range(len(corpus)):
  # `with` closes each handle once its content has been read.
  with open("data"+str(i+1)+".txt",'r') as f:
    extracted_corpus.append(f.read())

print(extracted_corpus)

In [None]:
# Extracting document names from current directory
doc_names = os.listdir('.')
print(doc_names)
# Keep only names that END in '.txt' (a substring test would also accept e.g.
# 'notes.txt.bak'), and sort because os.listdir returns entries in arbitrary order.
doc_names = sorted(name for name in doc_names if name.endswith('.txt'))
print(doc_names)

- ### with `CountVectorizer` : 

In [None]:
# Vectorize straight from the files on disk. `input` must be one of
# 'filename', 'file' or 'content'; with 'filename' the vectorizer opens and
# reads each path handed to fit_transform.
vectorizer4 = CountVectorizer(input='filename')
# Sort the paths so the matrix rows follow a stable file order.
X4 = vectorizer4.fit_transform(sorted(doc_names))
# Last expression of the cell: displays the learned feature names.
vectorizer4.get_feature_names_out()

In [None]:
# files = ['data1.txt','data2.txt','data3.txt','data4.txt']

In [None]:
# `extracted_corpus` already holds the file contents as strings, so the
# vectorizer must treat its input as raw content ('content' is the only
# valid `input` value for in-memory text; a list of names is not accepted).
vectorizer4 = CountVectorizer(input='content')
X4 = vectorizer4.fit_transform(extracted_corpus)
# Last expression of the cell: displays the learned feature names.
vectorizer4.get_feature_names_out()

In [None]:
# Dense view of the count matrix built from the file contents.
print(X4.toarray())

In [None]:
# NOTE(review): repeats the print in the preceding cell; candidate for removal.
print(X4.toarray())

- ### with `TfidfVectorizer` : 

In [None]:
# TF-IDF over the text read back from the files. `extracted_corpus` is a list
# of strings, so the input mode is 'content' (a list of file names is not a
# valid `input` value).
# NOTE: this rebinds `vectorizer` from the earlier cells.
vectorizer = TfidfVectorizer(input='content')
X5 = vectorizer.fit_transform(extracted_corpus)
# Last expression of the cell: displays the learned feature names.
vectorizer.get_feature_names_out()

In [None]:
# Report the matrix shape, the feature names, and the dense TF-IDF weights.
print(X5.shape)
print(vectorizer.get_feature_names_out())
print(X5.toarray())

## 3. Take part in a competition
> [Refer this notebook](https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments/notebook)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
import  matplotlib.pyplot as plt
import tensorflow as tf
import keras
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
import pickle
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import roc_auc_score , accuracy_score , confusion_matrix , f1_score
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the training data from the zip-compressed CSV (path is relative to the
# notebook's working directory).
train_df  =  pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
# Last expression of the cell: displays the frame.
train_df

In [None]:
# Load the test data from the zip-compressed CSV.
test_df=pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
# Last expression of the cell: displays the frame.
test_df

In [None]:
def clean_review_text(text):
    """Normalize a raw comment into lowercase, punctuation-free text.

    Steps, in order: lowercase, drop HTML-style tags, drop spans enclosed in
    square brackets or parentheses, strip ASCII punctuation, replace any
    remaining non-word character with a space, and drop every token that
    contains a digit.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned text with leading and trailing whitespace removed. Runs
        of internal spaces are not collapsed.
    """
    text = text.lower()  # convert the text to lowercase
    text = re.sub(r'<.*?>', '', text).strip()  # remove html tags
    # Remove text in square brackets and parentheses. Each alternative is a
    # complete, non-greedy bracket pair so only the enclosed span is dropped
    # (the previous pattern split into `\[`, `\(.*\]` and `\)` and could
    # greedily delete everything between a '(' and a later ']').
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text).strip()
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation marks
    # Replace every remaining non-word character (whitespace such as newlines,
    # non-ASCII symbols) with a space. Non-ASCII letters count as word
    # characters and are kept.
    text = re.sub(r'\W', ' ', text).strip()
    text = re.sub(r'\S*\d\S*\s*', '', text).strip()  # remove words containing numbers
    return text.strip()

In [None]:
# Coerce every comment to str first so clean_review_text can call .lower() on it.
# NOTE: this overwrites train_df.comment_text in place, so re-running the cell
# cleans already-cleaned text.
train_df.comment_text = train_df.comment_text.astype(str)
train_df.comment_text = train_df.comment_text.apply(clean_review_text)
# Last expression of the cell: preview the first cleaned comments.
train_df.comment_text.head()

In [None]:
import nltk
from nltk.stem.snowball import SnowballStemmer
import en_core_web_sm
nlp = en_core_web_sm.load()

snow_stemmer = SnowballStemmer(language='english')

# NOTE: this rebinds the name `stopwords`, which an earlier cell imported from
# nltk.corpus; from here on it refers to spaCy's English stop-word collection.
stopwords = nlp.Defaults.stop_words
def apply_stemmer(text):
    """Drop stop words from ``text`` and stem the remaining tokens.

    Args:
        text: Whitespace-separated text (already cleaned and lowercased by
            the caller, as far as this notebook shows).

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces; an empty
        string when no token survives.
    """
    words = text.split()
    # Test membership against `stopwords` directly instead of rebuilding
    # set(stopwords) for every single word.
    # NOTE(review): spaCy documents Defaults.stop_words as a set — confirm.
    sent = [snow_stemmer.stem(word) for word in words if word not in stopwords]
    return ' '.join(sent)

In [None]:
# Stem the cleaned training comments.
# NOTE: overwrites the column in place, so re-running the cell stems
# already-stemmed text.
train_df.comment_text = train_df.comment_text.apply(apply_stemmer)
# Last expression of the cell: preview the first stemmed comments.
train_df.comment_text.head()

In [None]:
# Features: the preprocessed comment text. Targets: every remaining column
# once 'id' and 'comment_text' are dropped.
# NOTE: rebinds `X`, which earlier cells used for a document-term matrix.
X = train_df.comment_text
y = train_df.drop(['id','comment_text'],axis = 1)

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
x_train,x_test,y_train,y_test =  train_test_split(X,y,test_size = 0.2,random_state = 45)

In [None]:
# Word-level TF-IDF over unigrams to trigrams. The token pattern keeps
# one-character tokens, and sublinear_tf replaces tf with 1 + log(tf).
word_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    sublinear_tf=True)

# Fit on the training split only. fit_transform learns the vocabulary and IDF
# weights and builds the training matrix in one pass, instead of tokenizing
# the training text once in fit() and again in transform().
train_word_features = word_vectorizer.fit_transform(x_train)

In [None]:
# The training matrix was already computed as `train_word_features` when the
# vectorizer was fitted; reuse it rather than transforming x_train again.
X_train_transformed = train_word_features
# The held-out split is projected onto the vocabulary learned from x_train.
X_test_transformed = word_vectorizer.transform(x_test)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.linear_model import LogisticRegression
# Seed passed to the base estimator as its random_state.
seed=100

# Base estimator: L2-penalized logistic regression with the liblinear solver.
log_reg = LogisticRegression(C = 10, penalty='l2', solver = 'liblinear', random_state=seed)

# fit model: the one-vs-rest wrapper trains on the TF-IDF training matrix
# against the label columns in y_train.
classifier_ovr_log = OneVsRestClassifier(log_reg)
classifier_ovr_log.fit(X_train_transformed, y_train)

In [None]:
# Predicted probabilities for the training and held-out splits.
# NOTE(review): neither result is used again in the cells visible here.
y_train_pred_proba = classifier_ovr_log.predict_proba(X_train_transformed)
y_test_pred_proba = classifier_ovr_log.predict_proba(X_test_transformed)

In [None]:
def make_test_predictions(df,classifier):
    """Preprocess ``df.comment_text`` like the training data and predict probabilities.

    The comments are coerced to ``str``, cleaned with ``clean_review_text``,
    stemmed with ``apply_stemmer`` and vectorized with the already-fitted
    module-level ``word_vectorizer``. ``df`` itself is left untouched, so
    calling this twice on the same frame gives the same result.

    Args:
        df: DataFrame with a ``comment_text`` column.
        classifier: Fitted estimator exposing ``predict_proba``.

    Returns:
        Whatever ``classifier.predict_proba`` returns for the vectorized comments.
    """
    # Work on a derived Series instead of assigning back into df: the previous
    # in-place overwrite meant a second call cleaned and stemmed the text again.
    # astype(str) mirrors the training path and guards against non-string cells.
    comments = (
        df.comment_text
        .astype(str)
        .apply(clean_review_text)
        .apply(apply_stemmer)
    )
    features = word_vectorizer.transform(comments)
    return classifier.predict_proba(features)

In [None]:
# Score the competition test set with the fitted one-vs-rest classifier.
y_pred=make_test_predictions(test_df,classifier_ovr_log)
# Last expression of the cell: displays the predicted probabilities.
y_pred

In [None]:
# Label the probability columns with the target column names taken from y.
y_pred_df = pd.DataFrame(y_pred,columns=y.columns)
# Last expression of the cell: displays the frame.
y_pred_df

In [None]:
# Put the test ids next to the predictions. concat(axis=1) aligns rows on the
# index, so this relies on test_df and y_pred_df sharing the same index.
submission_df = pd.concat([test_df.id, y_pred_df], axis=1)
# Write the submission file without the index column.
submission_df.to_csv('submission.csv', index = False)