In [1]:
!pip install spacy



In [2]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
text = """the pateinet is diagnosed with bronchitis, he has 100 degree F fever, rbc count is 1 million, hb level is 14.  and is having discmfort in chest,
he also has mild fever lets have meeting with him this week by 23rd May"""

In [5]:
insight = nlp(text)

In [6]:
insight

the pateinet is diagnosed with bronchitis, he has 100 degree F fever, rbc count is 1 million, hb level is 14.  and is having discmfort in chest,
he also has mild fever lets have meeting with him this week by 23rd May

In [7]:
print("Tokens: ", [token.text for token in insight])

Tokens:  ['the', 'pateinet', 'is', 'diagnosed', 'with', 'bronchitis', ',', 'he', 'has', '100', 'degree', 'F', 'fever', ',', 'rbc', 'count', 'is', '1', 'million', ',', 'hb', 'level', 'is', '14', '.', ' ', 'and', 'is', 'having', 'discmfort', 'in', 'chest', ',', '\n', 'he', 'also', 'has', 'mild', 'fever', 'lets', 'have', 'meeting', 'with', 'him', 'this', 'week', 'by', '23rd', 'May']


In [8]:
print("POS TAGS: ", [(token.text, token.pos_)for token in insight])

POS TAGS:  [('the', 'DET'), ('pateinet', 'NOUN'), ('is', 'AUX'), ('diagnosed', 'VERB'), ('with', 'ADP'), ('bronchitis', 'NOUN'), (',', 'PUNCT'), ('he', 'PRON'), ('has', 'VERB'), ('100', 'NUM'), ('degree', 'NOUN'), ('F', 'NOUN'), ('fever', 'NOUN'), (',', 'PUNCT'), ('rbc', 'PROPN'), ('count', 'NOUN'), ('is', 'AUX'), ('1', 'NUM'), ('million', 'NUM'), (',', 'PUNCT'), ('hb', 'PROPN'), ('level', 'NOUN'), ('is', 'AUX'), ('14', 'NUM'), ('.', 'PUNCT'), (' ', 'SPACE'), ('and', 'CCONJ'), ('is', 'AUX'), ('having', 'VERB'), ('discmfort', 'NOUN'), ('in', 'ADP'), ('chest', 'NOUN'), (',', 'PUNCT'), ('\n', 'SPACE'), ('he', 'PRON'), ('also', 'ADV'), ('has', 'AUX'), ('mild', 'ADJ'), ('fever', 'NOUN'), ('lets', 'NOUN'), ('have', 'AUX'), ('meeting', 'VERB'), ('with', 'ADP'), ('him', 'PRON'), ('this', 'DET'), ('week', 'NOUN'), ('by', 'ADP'), ('23rd', 'ADJ'), ('May', 'PROPN')]


In [9]:
print("Lemma: ", [token.lemma_ for token in insight])

Lemma:  ['the', 'pateinet', 'be', 'diagnose', 'with', 'bronchitis', ',', 'he', 'have', '100', 'degree', 'f', 'fever', ',', 'rbc', 'count', 'be', '1', 'million', ',', 'hb', 'level', 'be', '14', '.', ' ', 'and', 'be', 'have', 'discmfort', 'in', 'chest', ',', '\n', 'he', 'also', 'have', 'mild', 'fever', 'let', 'have', 'meet', 'with', 'he', 'this', 'week', 'by', '23rd', 'May']


In [10]:
print("Nanmed Entity Recognition: ", [(ent.text, ent.label_)for ent in insight.ents])

Nanmed Entity Recognition:  [('100 degree', 'QUANTITY'), ('F', 'PRODUCT'), ('1 million', 'CARDINAL'), ('14', 'DATE'), ('this week', 'DATE'), ('23rd May', 'DATE')]


In [11]:
print("Sentences: ", [sent.text for sent in insight.sents])

Sentences:  ['the pateinet is diagnosed with bronchitis, he has 100 degree F fever, rbc count is 1 million, hb level is 14.  and is having discmfort in chest,\nhe also has mild fever lets have meeting with him this week by 23rd May']


In [12]:
filtered = [token.text for token in insight if not token.is_stop and not token.is_punct]

In [13]:
filtered

['pateinet',
 'diagnosed',
 'bronchitis',
 '100',
 'degree',
 'F',
 'fever',
 'rbc',
 'count',
 '1',
 'million',
 'hb',
 'level',
 '14',
 ' ',
 'having',
 'discmfort',
 'chest',
 '\n',
 'mild',
 'fever',
 'lets',
 'meeting',
 'week',
 '23rd']

In [14]:
summary = sorted(insight.sents, key = lambda sent: len(sent.text), reverse=True)[0]

In [15]:
summary

the pateinet is diagnosed with bronchitis, he has 100 degree F fever, rbc count is 1 million, hb level is 14.  and is having discmfort in chest,
he also has mild fever lets have meeting with him this week by 23rd May

In [16]:
!pip install textblob



In [17]:
from textblob import TextBlob

In [18]:
blob = TextBlob(text)

In [19]:
blob.sentiment.polarity

0.3333333333333333

In [20]:
blob.sentiment.subjectivity

0.5

In [21]:
text = """ 1234, 5678, 345 kurtosis, 45, skewness, dividend is 40%"""

In [22]:
import pandas as pd

In [23]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from spacy.training.example import Example
import random

# Step 1: Load IMDb dataset
url = "https://raw.githubusercontent.com/SK7here/Movie-Review-Sentiment-Analysis/master/IMDB-Dataset.csv"
df = pd.read_csv(url)
df['sentiment'] = df['sentiment'].map({'positive': 'POSITIVE', 'negative': 'NEGATIVE'})

# Step 2: Split data
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

# Step 3: Create blank English pipeline
nlp = spacy.blank("en")

# Step 4: Add TextCategorizer with FULL config
textcat = nlp.add_pipe("textcat", config={
    "model": {
        "@architectures": "spacy.TextCatBOW.v1",
        "exclusive_classes": True,
        "ngram_size": 1,
        "no_output_layer": False
    }
})
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# Step 5: Prepare training data
train_data = []
for text, label in zip(X_train, y_train):
    cats = {"POSITIVE": label == "POSITIVE", "NEGATIVE": label == "NEGATIVE"}
    doc = nlp.make_doc(text)
    train_data.append(Example.from_dict(doc, {"cats": cats}))

# Step 6: Train model
optimizer = nlp.initialize()
for i in range(5):
    random.shuffle(train_data)
    losses = {}
    batches = spacy.util.minibatch(train_data, size=8)
    for batch in batches:
        nlp.update(batch, drop=0.3, losses=losses)
    print(f"Epoch {i+1}, Loss: {losses['textcat']}")

# Step 7: Prediction function
def predict_sentiment(text):
    doc = nlp(text)
    return "POSITIVE" if doc.cats["POSITIVE"] > doc.cats["NEGATIVE"] else "NEGATIVE"

# Step 8: Evaluate
y_pred = [predict_sentiment(text) for text in X_test]
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Epoch 1, Loss: 479.13887918381783
Epoch 2, Loss: 252.9664982878403
Epoch 3, Loss: 166.01301221350815
Epoch 4, Loss: 118.34696407708763
Epoch 5, Loss: 88.28297006969021
Accuracy: 0.8917
Classification Report:
               precision    recall  f1-score   support

    NEGATIVE       0.91      0.87      0.89      4961
    POSITIVE       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [24]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the IMDb dataset (raw CSV file from GitHub)
url = "https://raw.githubusercontent.com/SK7here/Movie-Review-Sentiment-Analysis/master/IMDB-Dataset.csv"
df = pd.read_csv(url)

# Map sentiment labels to binary values
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Split dataset into features and labels
X = df['review']
y = df['sentiment']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_vec)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Optional: Function to predict sentiment for new text
def predict_sentiment(text):
    vec = vectorizer.transform([text])
    prediction = model.predict(vec)
    return "Positive" if prediction[0] == 1 else "Negative"

# Example
print("Example Prediction:", predict_sentiment("This movie was absolutely fantastic!"))


Accuracy: 0.895
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.89      0.89     10000
weighted avg       0.90      0.90      0.89     10000

Example Prediction: Positive


In [26]:
# Task 19: % of stock items sold
sold_items = sales_df['StockCode'].nunique()
total_items = stock_df['StockCode'].nunique()
answers["Task 19"] = f"{(sold_items / total_items) * 100:.2f}%"

NameError: name 'sales_df' is not defined