<a href="https://colab.research.google.com/github/Satyamaadi/Algorithms/blob/main/NLP_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [2]:
# Load the two sci.* newsgroups (train and test subsets together), asking
# scikit-learn to strip headers, quoted replies and footers from each post.
posts = fetch_20newsgroups(
    subset='all',
    categories=['sci.electronics', 'sci.space'],
    remove=('headers', 'quotes', 'footers'),
)

In [3]:
# List the fields available on the fetched dataset object.
posts.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
# Display the raw text of the post at index 1.
posts.data[1]

"AL>>        Question:   Is there a certain device out there that I can\nAL>>                    use to find out the number to the line?\nAL>>        Thanks for any response.\nAL>>                                                    Al\n\nAL>There is a number you can call which will return a synthesized\nAL>voice telling you the number of the line.  Unfortunately, for the\nAL>life of me I can't remember what it is. The telephone technicians\nAL>use it all the time.  We used to play around with this in our\nAL>dorm rooms since there were multiple phone lines running between\nAL>rooms.\n\nIt probably wouldn't help for you to post the number, since it appears\nto be different in each area.  For what it's worth, in the New Orleans\narea the number is 998-877-6655 (easy to remember, what?)\n\n\n * SLMR 2.1 * Ask me anything: if I don't know, I'll make up something.\n                                          "

In [5]:
# One row per post: the raw text plus the newsgroup name looked up from the
# integer class id stored in posts.target.
df = pd.DataFrame(dict(
    text=posts.data,
    label=[posts.target_names[class_id] for class_id in posts.target],
))

In [6]:
# Display the assembled frame (columns: text, label).
df

Unnamed: 0,text,label
0,\n >\tIf the new Kuiper belt object *is* ...,sci.space
1,AL>> Question: Is there a certain dev...,sci.electronics
2,"\nIt's not quite what you were asking, but a f...",sci.space
3,"\n\n\nNo, the sky does not, at this time, belo...",sci.space
4,"\nDigi-Key also sells Quad Line Receivers, pa...",sci.electronics
...,...,...
1966,\n\n\nThanks again. One final question. The ...,sci.space
1967,\nCheck the station's master sync generator. ...,sci.electronics
1968,Brian Yamauchi asks: [Regarding orbital billbo...,sci.space
1969,\n\nWhy not design the solar arrays to be deta...,sci.space


In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [8]:
import nltk
# Fetch the NLTK resources used by the text-cleaning step: the Punkt tokenizer
# data for word_tokenize, WordNet for the lemmatizer, and the stopword lists.
nltk.download('punkt')
# NLTK 3.8.2+ loads the tokenizer from 'punkt_tab' rather than the pickled
# 'punkt' models; without it word_tokenize raises LookupError on a fresh
# kernel. On older releases this call just reports the package as unknown.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _cleaning_resources():
  """Build the English stopword set and the lemmatizer once, then reuse them."""
  return frozenset(stopwords.words('english')), WordNetLemmatizer()


def clean_text(text):
  """Normalise one raw document into a space-separated string of lemmas.

  Each token produced by ``word_tokenize`` is:
    1. dropped unless it is purely alphabetic (``str.isalpha``),
    2. lower-cased,
    3. dropped if it is an English stopword,
    4. lemmatized with ``WordNetLemmatizer``.

  Args:
    text: The raw document as a string.

  Returns:
    The surviving lemmas joined by single spaces; an empty string when no
    token survives the filters.
  """
  # The stopword list and lemmatizer do not depend on `text`, so they are
  # fetched from a one-slot cache instead of being rebuilt for every row.
  stop_words, lemmatizer = _cleaning_resources()
  lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
  lemmas = [lemmatizer.lemmatize(word) for word in lowered if word not in stop_words]
  return ' '.join(lemmas)

In [10]:
# Run every raw post through clean_text and store the result in a new column.
df['cleaned_text'] = df['text'].map(clean_text)

In [11]:
# Keep only the model inputs: the cleaned text and its newsgroup label.
clean_df = df.loc[:, ['cleaned_text', 'label']]

In [12]:
from sklearn.model_selection import train_test_split

# Features are the cleaned documents; the target is the newsgroup name.
X, y = clean_df['cleaned_text'], clean_df['label']

# Fixed random_state so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts. The vocabulary is learned from the training split only;
# min_df=10 drops terms that occur in fewer than 10 training documents.
count_vect = CountVectorizer(min_df=10)
X_train_count, X_test_count = (
    count_vect.fit_transform(X_train),
    count_vect.transform(X_test),
)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features fitted on the training split only. Terms present in more
# than 70% of training documents (max_df) or fewer than 1% (min_df) are ignored.
tfidf_vect = TfidfVectorizer(min_df=0.01, max_df=0.7)
tfidf_train = tfidf_vect.fit_transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [18]:
# Fit multinomial Naive Bayes on the count features, predict the held-out
# split, and report the accuracy as the cell output.
nb = MultinomialNB().fit(X_train_count, y_train)
y_pred = nb.predict(X_test_count)
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)


0.9006085192697769

In [19]:
# Pin the class order explicitly so the matrix layout is fixed.
labels = ['sci.electronics', 'sci.space']
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred, labels=labels)


In [20]:
# Display the confusion matrix.
cm

array([[227,   7],
       [ 42, 217]])