## Solution for Question 34 in the PDF file containing preparatory questions for the final exam on ML & NLP:

In [None]:
# Importing some required libraries and dependencies:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [None]:
# Step 1: Defining the given dataset

# One tuple per customer: (Age, Salary, Purchased)
records = [
    (25, 50000, 'No'),
    (30, 60000, 'Yes'),
    (35, 70000, 'Yes'),
    (40, 80000, 'No'),
    (55, 85000, 'Yes'),
]
column_names = ['Age', 'Salary', 'Purchased']

# Transpose the rows into a column-name -> list-of-values mapping
data = {name: list(values) for name, values in zip(column_names, zip(*records))}

df = pd.DataFrame(data)

# Show the resulting table:
df

Unnamed: 0,Age,Salary,Purchased
0,25,50000,No
1,30,60000,Yes
2,35,70000,Yes
3,40,80000,No
4,55,85000,Yes


In [None]:
# Step 2: Creating a custom function for Min_max normalization

def min_max_normalize(column):
    """Rescale ``column`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to rescale. Any object offering ``min()``, ``max()``,
        ``astype()`` and element-wise arithmetic (e.g. a NumPy array) works too.

    Returns
    -------
    Same kind of object as ``column``
        ``(column - min) / (max - min)`` for every element. When all values
        are equal the range is zero, so a column of ``0.0`` is returned
        instead of the NaNs that ``0 / 0`` would produce.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val

    if value_range == 0:
        # Constant column: dividing by a zero range would turn every value into NaN.
        # Cast to float so the dtype matches the regular (division) branch.
        return (column - min_val).astype(float)

    return (column - min_val) / value_range

# Add a min-max scaled version of Salary as a new column:
salary_column = df['Salary']
df['Normalized_salary'] = min_max_normalize(salary_column)

# Show the updated table:
df

Unnamed: 0,Age,Salary,Purchased,Normalized_salary
0,25,50000,No,0.0
1,30,60000,Yes,0.285714
2,35,70000,Yes,0.571429
3,40,80000,No,0.857143
4,55,85000,Yes,1.0


In [None]:
# Encode the text label as an integer class: 'Yes' -> 1, 'No' -> 0
purchase_codes = {'Yes': 1, 'No': 0}
df['Purchased_numerics'] = df['Purchased'].map(purchase_codes)

# Show the updated table:
df

Unnamed: 0,Age,Salary,Purchased,Normalized_salary,Purchased_numerics
0,25,50000,No,0.0,0
1,30,60000,Yes,0.285714,1
2,35,70000,Yes,0.571429,1
3,40,80000,No,0.857143,0
4,55,85000,Yes,1.0,1


In [None]:
# Train a Gaussian Naive Bayes classifier and check it on held-out rows.

# Features: raw Age plus the min-max scaled salary; label: the 0/1 purchase flag.
feature_columns = ['Age', 'Normalized_salary']
inputs = df[feature_columns]
target = df['Purchased_numerics']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.3,
    random_state=42,
)

# Fit on the training rows (fit() hands back the fitted estimator itself).
model = GaussianNB().fit(X_train, y_train)

# Predict the held-out rows.
y_pred = model.predict(X_test)

# Put predictions and ground truth side by side:
print('Model predictions on the testing set:', y_pred)
print('The actual target values on the testing set: ', y_test.values)

Model predictions on the testing set: [0 0]
The actual target values on the testing set:  [1 1]


## Solution for Question 35 in the PDF file containing preparatory questions for the final exam on ML & NLP (`Needs Higher Attention`):

In [None]:
# Importing required libraries and dependencies:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import spacy

nlp = spacy.load('en_core_web_sm')

In [None]:
# Step 1. Defining the given dataset in code:

# One tuple per review: (ID, Text, Sentiment)
records = [
    (1, 'I love this product!', 'Positive'),
    (2, 'The quality is terrible!', 'Negative'),
    (3, 'Worth every penny, highly recommended!', 'Positive'),
    (4, 'Not what I expected, disappointed', 'Negative'),
    (5, 'Very good book. Highly recommended', 'Positive'),
]
column_names = ['ID', 'Text', 'Sentiment']

# Transpose the rows into a column-name -> list-of-values mapping
data = {name: list(values) for name, values in zip(column_names, zip(*records))}

df = pd.DataFrame(data)

# Show the resulting table:
df

Unnamed: 0,ID,Text,Sentiment
0,1,I love this product!,Positive
1,2,The quality is terrible!,Negative
2,3,"Worth every penny, highly recommended!",Positive
3,4,"Not what I expected, disappointed",Negative
4,5,Very good book. Highly recommended,Positive


In [None]:
# Performing TOKENIZATION and removing STOP WORDS from the given sentences:

def preprocess_text(text):
    """Return ``text`` reduced to its lower-cased, non-stop-word alphabetic tokens.

    The text is lower-cased and passed through the module-level spaCy
    pipeline ``nlp``. Tokens are kept only when ``is_alpha`` is true and
    ``is_stop`` is false; the survivors are joined with single spaces.
    """
    parsed = nlp(text.lower())
    kept = [token.text for token in parsed if token.is_alpha and not token.is_stop]
    return " ".join(kept)


# Clean every review, then encode the label as an integer: Positive -> 1, Negative -> 0
sentiment_codes = {'Positive': 1, 'Negative': 0}

df['Processed_text'] = df['Text'].apply(preprocess_text)
df['Sentiment_numerics'] = df['Sentiment'].map(sentiment_codes)

# Show the modified DataFrame:
df

Unnamed: 0,ID,Text,Sentiment,Processed_text,Sentiment_numerics
0,1,I love this product!,Positive,love product,1
1,2,The quality is terrible!,Negative,quality terrible,0
2,3,"Worth every penny, highly recommended!",Positive,worth penny highly recommended,1
3,4,"Not what I expected, disappointed",Negative,expected disappointed,0
4,5,Very good book. Highly recommended,Positive,good book highly recommended,1


In [None]:
# Train a bag-of-words logistic-regression sentiment classifier and check it on held-out text.

# Features: the cleaned review text; label: the 0/1 sentiment code.
inputs = df['Processed_text']
target = df['Sentiment_numerics']

# Split the text BEFORE vectorizing, so the held-out reviews play no part in
# building the vocabulary (fitting the vectorizer on all rows leaks test data).
text_train, text_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary from the training reviews only ...
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)

# ... and encode the test reviews with that same vocabulary (words never seen
# during training are ignored by transform()).
X_test = vectorizer.transform(text_test)

# Training the model:
model = LogisticRegression()
model.fit(X_train, y_train)

# Testing the model:
y_pred = model.predict(X_test)

# Comparing the model's predictions to the actual target values in the testing set:
print('Model predictions on the testing set: ', y_pred)
print('The actual target values on testing set: ', y_test.values)

Model predictions on the testing set:  [1 1]
The actual target values on testing set:  [0 1]


## Solution for Question 36 in the PDF file containing preparatory questions for the final exam on ML & NLP (`Needs Higher Attention`):

In [1]:
# Import the required dependencies:

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Define the dataset given in the question.
# One tuple per student: (Student, Hours Studied, Test Score)
records = [
    ('Alice', 2, 50),
    ('Bob', 4, 70),
    ('Carol', 6, 80),
    ('Dave', 8, 90),
]
column_names = ['Student', 'Hours Studied', 'Test Score']

# Transpose the rows into a column-name -> list-of-values mapping
data = {name: list(values) for name, values in zip(column_names, zip(*records))}

df = pd.DataFrame(data)

# Show the resulting table:
df

Unnamed: 0,Student,Hours Studied,Test Score
0,Alice,2,50
1,Bob,4,70
2,Carol,6,80
3,Dave,8,90


In [2]:
# Pull the single feature and the target out of the frame as NumPy arrays:
X_input = np.array(df['Hours Studied'])
y_target = np.array(df['Test Score'])

# scikit-learn wants a 2-D feature matrix, so turn the 1-D hours array into one column.
hours_matrix = X_input[:, np.newaxis]

# Fit the linear regression (fit() hands back the fitted estimator itself):
Linear_model = LinearRegression().fit(hours_matrix, y_target)

# Ask the model for the score of one student who studies 5 hours:
hours_to_predict = [[5]]
predicted_score = Linear_model.predict(hours_to_predict)

print('The predicted score for a student studying for 5 hours: ', predicted_score)

The predicted score for a student studying for 5 hours:  [72.5]


## **Exercise 1 (`Needs Higher Attention`):**
- Count the number of stop words in the text given below.
- Print the percentage of stop-word tokens relative to all tokens in the text.

In [4]:
#import spacy and loading the english language model configurations:

import spacy
nlp = spacy.load("en_core_web_sm")


text = '''
Thor: Love and Thunder is a 2022 American superhero film based on Marvel Comics featuring the character Thor, produced by Marvel Studios and
distributed by Walt Disney Studios Motion Pictures. It is the sequel to Thor: Ragnarok (2017) and the 29th film in the Marvel Cinematic Universe (MCU).
The film is directed by Taika Waititi, who co-wrote the script with Jennifer Kaytin Robinson, and stars Chris Hemsworth as Thor alongside Christian Bale, Tessa Thompson,
Jaimie Alexander, Waititi, Russell Crowe, and Natalie Portman. In the film, Thor attempts to find inner peace, but must return to action and recruit Valkyrie (Thompson),
Korg (Waititi), and Jane Foster (Portman)—who is now the Mighty Thor—to stop Gorr the God Butcher (Bale) from eliminating all gods.
'''

# Step 1: run the text through the spaCy pipeline to get a tokenized document
doc = nlp(text)


# Step 2: count the tokens.
# Every token in the document is counted towards the total (no filtering is applied),
# and a token counts as a stop word when its `is_stop` flag is set.
total_words_count = len(doc)
stop_words_count = sum(1 for token in doc if token.is_stop)


# Step 3: report both counts
print('Total number of words in the given text: ', total_words_count)
print(f"Total Stop words presented in the given text: {stop_words_count}")


# Step 4: report the share of stop words among all tokens, as a percentage
percentage_stop_words = (stop_words_count / total_words_count) * 100
print(f"Percentage of Stop words presented in the given text: {percentage_stop_words} %")

Total number of words in the given text:  160
Total Stop words presented in the given text: 40
Percentage of Stop words presented in the given text: 25.0 %
