![Stay Home](https://storage.googleapis.com/kaggle-datasets-images/1740/3025/a3b95419dcdc1ad06dbff6f54db18511/dataset-cover.jpg)

In [None]:
#importing common libraries
import pandas as pd
from matplotlib import pyplot as plt 
import seaborn as sns
import numpy as np
%matplotlib inline

In [None]:
#importing dataset from sklearn
from sklearn.datasets import fetch_20newsgroups
# Fetch the train and test subsets of 20 Newsgroups, restricted to the two
# categories this notebook classifies between.
train, test = (
    fetch_20newsgroups(subset=split, categories=["alt.atheism", "sci.space"])
    for split in ("train", "test")
)
# The raw post texts are the features; the target arrays are the labels.
X_train, y_train = train.data, train.target
X_test, y_test = test.data, test.target

In [None]:
# Wrap the raw training posts in a single-column frame ('mess') for exploration
df = pd.DataFrame({'mess': X_train})

In [None]:
# Attach the training labels (y_train) as a 'target' column alongside each post
df['target']=y_train

In [None]:
# Character count of every post, used as a feature for the visualizations
df['length'] = df['mess'].str.len()
df.head()

## Visualizations

In [None]:
# Bar plot of post length per target class
# (seaborn's barplot aggregates with the mean by default)
sns.barplot(data=df, x='target', y='length')

In [None]:
# Overlay the post-length distribution of each class on one wide axis.
# histplot replaces distplot, which seaborn deprecated in v0.11;
# kde=True with stat='density' keeps the histogram-plus-density-curve look.
g = sns.FacetGrid(df, hue='target', height=4, aspect=2)
g = g.map(sns.histplot, 'length', kde=True, stat='density')
plt.legend()
# Length is not a good feature: the two classes' distributions look about the same.

## Text Pre-processing

In [None]:
#importing string for punctuations
import string
#now we import most common words i.e. stopwords
from nltk.corpus import stopwords

In [None]:
#making a function to process our data
def text_process(mess, stop_words=None):
    """Tokenize a message, dropping punctuation and stop words.

    Parameters
    ----------
    mess : str
        Raw text of one message.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English
        stop-word list is used.

    Returns
    -------
    list of str
        The whitespace-separated tokens that remain, in their original
        order and with their original casing.
    """
    if stop_words is None:
        # Build the stop-word collection once per call rather than once per token.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests instead of a list scan per token.
    stop_words = set(stop_words)
    # Strip every character listed in string.punctuation.
    no_punc = ''.join(c for c in mess if c not in string.punctuation)
    # Stop words are matched case-insensitively; surviving tokens keep their casing.
    return [word for word in no_punc.split() if word.lower() not in stop_words]

In [None]:
# Tokenize every post with text_process and keep the resulting
# token lists in a new 'message' column
df['message'] = df['mess'].map(text_process)

In [None]:
# Drop the unprocessed text column now that the tokenized 'message' column exists.
# Rebinding (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without raising KeyError once 'mess' is already gone.
df = df.drop(columns=['mess'], errors='ignore')
df.head()

In [None]:
# Preview the first few token lists produced by text_process
df['message'].head()

## Normalization & Vectorization

In [None]:
#Importing CountVectorizer to a collection of text documents to a matrix of token counts.
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# df['message'] already holds the token lists produced by text_process, so the
# analyzer only needs to pass each list through unchanged. Running text_process
# a second time here would join each post's tokens into a single
# whitespace-free string, leaving roughly one vocabulary entry per document.
bow_transformer = CountVectorizer(analyzer=lambda tokens: tokens).fit(df['message'])
# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

In [None]:
# Encode every post as a row of token counts using the fitted vocabulary
messages_bow = bow_transformer.transform(df['message'])
print('Shape of Sparse Matrix: ', messages_bow.shape)
# nnz is the number of stored (non-zero) entries in the sparse matrix
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

In [None]:
# Importing TfidfTransformer, which converts a matrix of token counts into TF-IDF features
# (unlike TfidfVectorizer, it works on counts rather than on raw documents).
from sklearn.feature_extraction.text import TfidfTransformer
# Fit on the bag-of-words counts computed above
tfidf_transformer = TfidfTransformer().fit(messages_bow)

In [None]:
# Apply the fitted TF-IDF weighting to the count matrix and check the resulting shape
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

## Making Pipeline for Predictions

### This is the main part: an sklearn pipeline automatically performs all the steps we did by hand in Normalization & Vectorization

We just need to pass our text-processing function and, additionally, the prediction method. **Here I have shown Naive Bayes, but you can choose any classifier.**

In [None]:
#methods we will be using to predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

I have predicted using 3 methods, but here I show Naive Bayes; just change MultinomialNB() to LogisticRegression() if you want to predict using Logistic Regression.

In [None]:
from sklearn.pipeline import Pipeline
# Chain the three stages so raw text goes in and class predictions come out:
#   bow        -> tokenize with text_process and count the tokens
#   tfidf      -> re-weight the counts as TF-IDF
#   classifier -> the estimator; swap MultinomialNB() for any other classifier
# The final step uses a model-agnostic name without a trailing space, so it
# stays accurate whichever model is plugged in and is easy to look up in
# pipeline.named_steps.
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

In [None]:
# Fit the whole pipeline on the training data only; the test set is used later for prediction and scoring
pipeline.fit(X_train,y_train)

In [None]:
# Predict the class of every test post with the fitted pipeline
predictions=pipeline.predict(X_test)

In [None]:
#Using Naive Bayes
from sklearn.metrics import f1_score
from sklearn import metrics
# f1_score defaults to average='binary', and scikit-learn only uses `labels`
# when average != 'binary'. The former labels=np.unique(predictions) argument
# therefore had no effect and is omitted.
print('F1-score: ',metrics.f1_score(y_test,predictions))
print('Accuracy: ',metrics.accuracy_score(y_test,predictions))

In [None]:
# #Using Logistic Regression
# from sklearn.metrics import f1_score
# from sklearn import metrics
# print('F1-score: ',metrics.f1_score(y_test,predictions, labels=np.unique(predictions)))
# print('Accuracy: ',metrics.accuracy_score(y_test,predictions))

<p style="padding: 10px;
              background-color:black;
                  color:white;">
F1-score:  0.953883495145631
    </p>
    <p style="padding: 10px;
              background-color:black;
                  color:white;">
Accuracy:  0.9467040673211781
    </p>

In [None]:
# #Using SVC
# from sklearn.metrics import f1_score
# from sklearn import metrics
# print('F1-score: ',metrics.f1_score(y_test,predictions, labels=np.unique(predictions)))
# print('Accuracy: ',metrics.accuracy_score(y_test,predictions))

<p style="padding: 10px;
              background-color:black;
                  color:white;">
F1-score:  0.963144963144963
    </p>
    <p style="padding: 10px;
              background-color:black;
                  color:white;">
Accuracy:  0.9579242636746143
    </p>

**Naive Bayes seems to perform best!**

<div class="alert alert-block alert-info" style="font-size:14px; font-family:verdana;">
    📌 Thanks for Learning! Hope to see you again!</div>