In [1]:
import pandas as pd
data = pd.read_csv('BBC_DATA.csv')
data.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [2]:
data.columns

Index(['ArticleId', 'Text', 'Category'], dtype='object')

In [3]:
# Tokenization, Stemming and Lemmatization with NLTK

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

sample_article = data['Text'][0]  # Assuming the first article as the sample
words = word_tokenize(sample_article)
sentences = sent_tokenize(sample_article)

porter = PorterStemmer()
stemmed_words = [porter.stem(word) for word in words]

print("Tokenized Words:")
print(words)
print("\nTokenized Sentences:")
print(sentences)


lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print("\nStemmed Words:")
print(stemmed_words)
print("\nLemmatized Words:")
print(lemmatized_words)


Tokenized Words:

Tokenized Sentences:

Stemmed Words:
['worldcom', 'ex-boss', 'launch', 'defenc', 'lawyer', 'defend', 'former', 'worldcom', 'chief', 'berni', 'ebber', 'against', 'a', 'batteri', 'of', 'fraud', 'charg', 'have', 'call', 'a', 'compani', 'whistleblow', 'as', 'their', 'first', 'wit', '.', 'cynthia', 'cooper', 'worldcom', 's', 'ex-head', 'of', 'intern', 'account', 'alert', 'director', 'to', 'irregular', 'account', 'practic', 'at', 'the', 'us', 'telecom', 'giant', 'in', '2002.', 'her', 'warn', 'led', 'to', 'the', 'collaps', 'of', 'the', 'firm', 'follow', 'the', 'discoveri', 'of', 'an', '$', '11bn', '(', '£5.7bn', ')', 'account', 'fraud', '.', 'mr', 'ebber', 'ha', 'plead', 'not', 'guilti', 'to', 'charg', 'of', 'fraud', 'and', 'conspiraci', '.', 'prosecut', 'lawyer', 'have', 'argu', 'that', 'mr', 'ebber', 'orchestr', 'a', 'seri', 'of', 'account', 'trick', 'at', 'worldcom', 'order', 'employe', 'to', 'hide', 'expens', 'and', 'inflat', 'revenu', 'to', 'meet', 'wall', 'street', 'ea

In [4]:
# Named Entity Recognition with SpaCy:

import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(sample_article)
entities = [(ent.text, ent.label_) for ent in doc.ents]

print("\nNamed Entities:")
print(entities)

from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)


Named Entities:
[('worldcom ex-boss', 'PERSON'), ('worldcom', 'ORG'), ('bernie', 'PERSON'), ('first', 'ORDINAL'), ('cynthia cooper  worldcom s ex-head', 'PERSON'), ('us', 'GPE'), ('2002', 'DATE'), ('5.7bn', 'MONEY'), ('worldcom', 'ORG'), ('new york', 'GPE'), ('wednesday', 'DATE'), ('arthur andersen', 'PERSON'), ('worldcom', 'ORG'), ('early 2001 and 2002', 'DATE'), ('worldcom', 'ORG'), ('cooper', 'PERSON'), ('worldcom financial', 'ORG'), ('scott sullivan', 'PERSON'), ('sullivan', 'PERSON'), ('2001', 'DATE'), ('85 years', 'DATE'), ('worldcom', 'ORG'), ('2004', 'DATE'), ('mci', 'ORG'), ('last week', 'DATE'), ('mci', 'ORG'), ('6.75bn', 'MONEY')]


In [5]:
# Word2Vec with gensim

from gensim.models import Word2Vec
word2vec_model = Word2Vec(sentences=data['Text'].apply(word_tokenize), vector_size=100, window=5, min_count=1, workers=4)
sample_word = 'technology'
sample_word_vector = word2vec_model.wv[sample_word]
print(f"\nVector representation of '{sample_word}':")
print(sample_word_vector)



Vector representation of 'technology':
[-0.4231125  -0.06354127 -0.03468318 -0.02177207 -0.24687235 -1.045194
  0.32568595  1.3702469  -0.06068807 -0.36313516 -0.5771616  -1.1075568
  0.09151505  0.5539394   0.17997731 -0.6815435  -0.04775981 -1.0573169
 -0.6435405  -1.2432263   0.9238907  -0.04779443  0.68750304 -0.4390832
  0.06762236 -0.4091824  -0.30868125 -0.37890086 -0.41640967 -0.1201301
  0.842313   -0.03963032  0.21407649 -0.74733573 -0.48620135  0.8797714
  0.32039687 -0.340942   -0.8634029  -0.84692985 -0.12229922 -0.57725495
 -0.4197596  -0.34886405  0.3902439  -0.27625385 -0.19691916 -0.52132314
  0.18854178  0.7920559  -0.05974095 -1.192997   -0.1467415   0.169373
 -0.09228957  0.01106747  0.20610292 -0.38572687 -0.7908143  -0.14005096
  0.80115277 -0.05560902  0.61248565  0.0434361  -0.9293523   1.2165565
 -0.14639191  0.09973356 -0.7424541   0.92896795  0.08198676  0.14157309
  0.8244816   0.00270061  0.7462051  -0.19099575  0.36708826 -0.31430268
  0.00991806  0.09122

In [6]:
# TF-IDF with scikit-learn:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Text'])

cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
print(f"\nCosine similarity between first two articles: {cosine_sim[0][0]}")


Cosine similarity between first two articles: 0.07875931547482325


## NLP on sms spam dataset

In [7]:
! pip install textblob



In [8]:
# Import necessary libraries
import pandas as pd
from textblob import TextBlob
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the SMS Spam Collection Dataset into a pandas DataFrame
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
df = pd.read_csv(url, sep='\t', header=None)
df.columns = ['label', 'message']

df.head()



Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Sentiment Analysis with TextBlob
df['sentiment'] = df['message'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Named Entity Recognition with SpaCy
nlp = spacy.load('en_core_web_sm')
doc = nlp(df['message'].iloc[0])  # Example: Process the first message
for ent in doc.ents:
    print(ent.text, ent.label_)

# Text Classification with TF-IDF and scikit-learn
X = df['message']
y = df['label']

# Convert labels to binary: spam=1, ham=0
y = y.map({'spam': 1, 'ham': 0})

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize TF-IDF vectorizer and fit-transform the training data
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = nb_classifier.predict(X_test_tfidf)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


jurong point PERSON
Accuracy: 0.9721973094170404
