## NLP Pipeline
Overview of full text processing pipeline: text input → preprocessing → vectorization → model → output.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Sample dataset
df = pd.DataFrame({
    'text': ["I am happy", "This is bad", "I love it", "I hate it"],
    'label': [1, 0, 1, 0]
})

# Train-test split with stratification for class balance
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'],
    test_size=0.5, random_state=42,
    stratify=df['label']
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic Regression Model
clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)

# Predictions and Evaluation
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



## Text Preprocessing
Steps include: tokenization, lowercasing, punctuation removal, stopword removal, stemming, and lemmatization.

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

text = "Cats are running, dogs are barking."
tokens = word_tokenize(text.lower())
stop_words = set(stopwords.words('english'))
filtered = [t for t in tokens if t.isalpha() and t not in stop_words]
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(t) for t in filtered]
print(lemmas)

[nltk_data] Downloading package punkt to /home/sathwik-
[nltk_data]     itthagoni/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/sathwik-
[nltk_data]     itthagoni/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/sathwik-
[nltk_data]     itthagoni/nltk_data...


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/sathwik-itthagoni/nltk_data'
    - '/home/sathwik-itthagoni/miniconda3/envs/nlp_env/nltk_data'
    - '/home/sathwik-itthagoni/miniconda3/envs/nlp_env/share/nltk_data'
    - '/home/sathwik-itthagoni/miniconda3/envs/nlp_env/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## Assignment


In [25]:
import requests
import time
import pandas as pd
import re


In [26]:
def remove_html_tags(text):
    """Remove HTML tags from a string."""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)
   

In [41]:


def remove_urls(text):
    """
    Replace URLs (http, https, www, or bare domains) with a [LINK] token.
    """
    # Match http(s), www., or bare domain names
    url_pattern = re.compile(
        r'(http[s]?://\S+|'         # http or https
        r'www\.\S+|'                # www.
        r'\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}(?:/\S*)?)'  # domain.com or domain.co.uk/path
    )

    return re.sub(url_pattern, '[LINK]', text)


In [None]:
def tokenize_text(text):
    """
    Tokenize text into words.
    """
    return text.split()

In [71]:
import string
def rem_punctuation(text):
    """
    Remove punctuation from a string.
    """
    exclude = string.punctuation
    return text.translate(str.maketrans('', '', exclude))

In [None]:
sample_punctuation = "Hello, world! This is a test: remove punctuation. I am in New-York"
cleaned_text = rem_punctuation(sample_punctuation)
print(cleaned_text)

Hello world This is a test remove punctuationI am in NewYork


In [53]:
from nltk.corpus import stopwords
def remove_stopwords(text):
    """
    Remove stopwords from a string.
    """
    stop_words = set(stopwords.words('english'))
    words = text.split()
    return ' '.join([word for word in words if word.lower() not in stop_words])

In [61]:
def remove_emojis(text):
    """
    Remove emojis from a string.
    """
    emoji_pattern = re.compile(
        "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"  # dingbats
            u"\U000024C2-\U0001F251"  # enclosed characters
        "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

In [None]:
import emoji
def replace_emojis(text):
    """
    Remove emojis from a string.
    """
    return emoji.demojize(text)

In [60]:
sample_emoji_text = "Hello 😊, this is a test with emojis! 🚀"
sample_emoji_text = remove_emojis(sample_emoji_text)
print(sample_emoji_text)

Hello :smiling_face_with_smiling_eyes:, this is a test with emojis! :rocket:


In [54]:
sample_stopwords = "This is a sample sentence with some stopwords."
cleaned_text = remove_stopwords(sample_stopwords)
print(cleaned_text)

sample sentence stopwords.


In [62]:
sample_punctuation = "Hello, world! This is a test: remove punctuation. I am in New-York"
cleaned_text = rem_punctuation(sample_punctuation)
print(cleaned_text)

Hello world This is a test remove punctuation I am in NewYork


In [51]:
from textblob import TextBlob
def correct_spelling(text):
    """
    Correct spelling in a string using TextBlob.
    """
    return str(TextBlob(text).correct())

In [50]:
sample_wrong_spelling = "I havv a dreem that one day this naation will rise up."
corrected_text = correct_spelling(sample_wrong_spelling)
print(corrected_text)

I have a dream that one day this nation will rise up.


In [27]:
sample = 'hi this is a <b>sample</b> text with <i>HTML</i> tags.'

In [30]:
sample_url = 'Check this link: https://example.com and this one: http://test.com'

In [31]:
remove_urls(sample_url)


'Check this link:  and this one: '

In [28]:
remove_html_tags(sample)


'hi this is a sample text with HTML tags.'

In [77]:
import spacy
nlp = spacy.load('en_core_web_sm')
def tokenize_text(text):
    """
    Tokenize text into words.
    """
    doc = nlp(text)
    return [tokens.text for tokens in doc ]

In [80]:
sample_tokenization = "Cats are running, dogs are barking. I am in A.I lab."
tokens = tokenize_text(sample_tokenization)
print(tokens)

['Cats', 'are', 'running', ',', 'dogs', 'are', 'barking', '.', 'I', 'am', 'in', 'A.I', 'lab', '.']


In [88]:
from nltk.stem.porter import PorterStemmer
def stem_text(text):
    """
    Stem words in a string using PorterStemmer.
    """
    stemmer = PorterStemmer()
    words = text.split()
    return ' '.join([stemmer.stem(word) for word in words])

In [86]:
import nltk
from nltk.stem import WordNetLemmatizer

word_net_lemmatizer = WordNetLemmatizer()
def lemmatize_word(text):
    """
    Lemmatize a word using WordNetLemmatizer.
    """
    return [word_net_lemmatizer.lemmatize(word,pos='v') for word in tokenize_text(text)]

In [87]:
sample_lemmatization = "Cats are running, dogs are barking. I am in A.I lab."
lemmatized_words = lemmatize_word(sample_lemmatization)
print(lemmatized_words)

['Cats', 'be', 'run', ',', 'dog', 'be', 'bark', '.', 'I', 'be', 'in', 'A.I', 'lab', '.']


In [9]:
import requests
import time
import pandas as pd

base_url = "https://api.themoviedb.org/3/movie/top_rated"
api_key = "8265bd1679663a7ea12ac168da84d2e8"
language = "en-US"
all_movies = []

for page in range(1, 472):
    url = f"{base_url}?api_key={api_key}&language={language}&page={page}"
    for attempt in range(3):  # Retry up to 3 times
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                data = response.json()
                movies = [
                    {
                        "name": movie.get("title"),
                        "description": movie.get("overview"),
                        "genre": movie.get("genre_ids")
                    }
                    for movie in data.get("results", [])
                ]
                all_movies.extend(movies)
                print(f"Fetched page {page}")
                break
            else:
                print(f"Page {page} failed with status code: {response.status_code}")
        except Exception as e:
            print(f"Attempt {attempt+1} failed for page {page}: {e}")
            time.sleep(1)
    time.sleep(0.3)  # Sleep to avoid rate limiting

# Create DataFrame
all_movies_df = pd.DataFrame(all_movies)
print(all_movies_df.head())


Fetched page 1
Fetched page 2
Fetched page 3
Attempt 1 failed for page 4: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Fetched page 4
Fetched page 5
Fetched page 6
Fetched page 7
Fetched page 8
Fetched page 9
Fetched page 10
Fetched page 11
Attempt 1 failed for page 12: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Fetched page 12
Fetched page 13
Fetched page 14
Attempt 1 failed for page 15: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Fetched page 15
Fetched page 16
Fetched page 17
Attempt 1 failed for page 18: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Attempt 2 failed for page 18: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Fetched page 18
Fetched page 19
Fetched page 20
Attempt 1 failed for page 21: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Fetched page 21
Fetched page 22
Fetc

In [12]:
all_movies_df['genre']

0                   [18, 80]
1                   [18, 80]
2                   [18, 80]
3            [18, 36, 10752]
4                       [18]
                ...         
9335           [18, 53, 878]
9336    [14, 53, 28, 12, 27]
9337                [35, 27]
9338          [27, 9648, 14]
9339                [35, 27]
Name: genre, Length: 9340, dtype: object

In [13]:
url = "https://api.themoviedb.org/3/genre/movie/list?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US"  # Replace with the actual URL

response = requests.get(url)
if response.status_code == 200:
    # Assuming the data is in JSON format
    data = response.json()
    
    # Extract relevant fields (adjust keys based on the actual structure of the JSON)
    genre_mapping = {
        genre['id']: genre['name'] for genre in data.get("genres", [])
    }
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")

In [14]:
all_movies_df['genre'] = all_movies_df['genre'].apply(
    lambda x: ', '.join(genre_mapping.get(genre_id, 'Unknown') for genre_id in x) if isinstance(x, list) else x
)

In [23]:
all_movies_df['description'] = all_movies_df['description'].apply(
    lambda x: x.lower() if isinstance(x, str) else "No description available" 
)           #FIRST CONVERT TO LOWER CASE 


In [92]:
all_movies_df['description'] = all_movies_df['description'].apply(
   remove_html_tags
)           #FIRST CONVERT TO LOWER CASE 


In [90]:
all_movies_df['description'] = all_movies_df['description'].apply(
   remove_urls
)           #FIRST CONVERT TO LOWER CASE 


In [None]:
all_movies_df['description'] = all_movies_df['description'].apply(
   rem_punctuation
)           #FIRST CONVERT TO LOWER CASE 
all_movies_df['description'] = all_movies_df['description'].apply(
   remove_stopwords
)           #FIRST CONVERT TO LOWER CASE 
all_movies_df['description'] = all_movies_df['description'].apply(
   correct_spelling
)           #FIRST CONVERT TO LOWER CASE 
all_movies_df['description'] = all_movies_df['description'].apply(
   remove_html_tags
)           #FIRST CONVERT TO LOWER CASE 
all_movies_df['description'] = all_movies_df['description'].apply(
   tokenize_text
)           #FIRST CONVERT TO LOWER CASE 
all_movies_df['description'] = all_movies_df['description'].apply(
  stem_text
)           #FIRST CONVERT TO LOWER CASE 

all_movies_df['description'] = all_movies_df['description'].apply(
   lemmatize_word
)           #FIRST CONVERT TO LOWER CASE 


## Bag of Words & TF-IDF
Vectorizing text using BoW and TF-IDF using sklearn. Comparing sparsity and feature analysis.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ["I love NLP", "NLP is fun", "I hate bugs"]

cv = CountVectorizer()
bow = cv.fit_transform(docs)
print("BoW features:", cv.get_feature_names_out(), bow.toarray())

tfidf = TfidfTransformer()
tfidf_vec = tfidf.fit_transform(bow)
print("TF-IDF array:\n", tfidf_vec.toarray())

## Word2Vec
Training CBOW and Skip-Gram models using Gensim or PyTorch. Visualizing embeddings.

In [None]:
from gensim.models import Word2Vec

sentences = [["I", "love", "NLP"], ["NLP", "is", "fun"], ["we", "love", "learning"]]
model = Word2Vec(sentences, vector_size=50, window=2, min_count=1, epochs=50)
print(model.wv["nlp"])

## Text Classification
Using logistic regression or a neural network on BoW/TF-IDF vectors or embeddings.

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

cats = ['alt.atheism', 'soc.religion.christian']
data = fetch_20newsgroups(subset='train', categories=cats, shuffle=True, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=cats, shuffle=True, random_state=42)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

pipeline.fit(data.data, data.target)
pred = pipeline.predict(data_test.data)
print(classification_report(data_test.target, pred))