Apply NLP techniques like Part-of-Speech tagging, Named Entity Recognition (NER), and
dependency parsing for text understanding. Perform text classification, sentiment analysis, and topic modeling to
extract insights from unstructured text.

Libraries Required

In [2]:
!pip install pandas numpy nltk spacy scikit-learn gensim matplotlib
!python -m spacy download en_core_web_sm

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencie

Load Dataset

In [4]:
import pandas as pd

df = pd.read_csv("tweets.csv")
print("Available columns:", df.columns)
df.head()

Available columns: Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')


Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


Text Preprocessing

In [5]:
import re
import spacy

nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^A-Za-z ]", "", text)
    return text.lower()

df['clean_text'] = df['text'].apply(clean_text)


Part-of-Speech (POS) Tagging

In [6]:
doc = nlp(df['clean_text'][0])

for token in doc:
    print(token.text, "→", token.pos_)


communal → ADJ
violence → NOUN
in → ADP
bhainsa → PROPN
telangana → PROPN
stones → PROPN
were → AUX
pelted → VERB
on → ADP
muslims → NOUN
houses → NOUN
and → CCONJ
some → DET
houses → NOUN
and → CCONJ
vehicles → NOUN
were → AUX
set → VERB
ablaze → ADV


Named Entity Recognition (NER)

In [7]:
for ent in doc.ents:
    print(ent.text, "→", ent.label_)


bhainsa telangana → PERSON
muslims → NORP


Dependency Parsing

In [8]:
for token in doc:
    print(f"{token.text} → {token.dep_} → {token.head.text}")


communal → amod → violence
violence → nsubjpass → pelted
in → prep → violence
bhainsa → compound → stones
telangana → compound → stones
stones → pobj → in
were → auxpass → pelted
pelted → ROOT → pelted
on → prep → pelted
muslims → compound → houses
houses → pobj → on
and → cc → houses
some → det → houses
houses → conj → houses
and → cc → houses
vehicles → conj → houses
were → auxpass → set
set → conj → pelted
ablaze → advmod → set


Sentiment Analysis (Supervised)
Label Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['sentiment_label'] = le.fit_transform(df['target'])

Train-Test Split

In [11]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['sentiment_label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


TF-IDF Vectorization

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


Logistic Regression Model

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
accuracy_score(y_test, y_pred)


0.8874230430958663

Topic Modeling (LDA)

In [14]:
from gensim import corpora
from gensim.models import LdaModel
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

texts = [
    [word for word in text.split() if word not in stop_words]
    for text in df['clean_text']
]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


LDA Model

In [15]:
lda = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=10
)

lda.print_topics()


[(0,
  '0.007*"people" + 0.006*"suicide" + 0.005*"dont" + 0.004*"emergency" + 0.004*"im" + 0.004*"china" + 0.004*"weapons" + 0.004*"war" + 0.003*"work" + 0.003*"one"'),
 (1,
  '0.008*"volcano" + 0.006*"like" + 0.004*"amp" + 0.004*"nuclear" + 0.004*"terrorist" + 0.004*"trauma" + 0.003*"iran" + 0.003*"winds" + 0.003*"us" + 0.003*"force"'),
 (2,
  '0.010*"storm" + 0.006*"im" + 0.006*"like" + 0.004*"one" + 0.004*"get" + 0.004*"amp" + 0.004*"violent" + 0.004*"wreck" + 0.003*"wounds" + 0.003*"thats"'),
 (3,
 (4,
  '0.009*"amp" + 0.007*"taal" + 0.006*"like" + 0.005*"people" + 0.004*"im" + 0.004*"get" + 0.003*"fires" + 0.003*"one" + 0.003*"dont" + 0.003*"would"')]