# Session 7

Review

Files needed
* ACMETelephoneABT.csv
* imdb_reviews.csv
---


Michael de la Maza

AI/ML

Hult International Business School

Based on "Fundamentals of Machine Learning for Predictive Data Analytics" by Kelleher et al.

# ACME Telephone Case Study



*   Decision Tree
*   Random Forest
*   XGBoost
*   MLPClassifier





#### Load data and check

In [2]:
# Load the ACME Telephone analytics base table (ABT) into a DataFrame.
# The CSV is expected in the notebook's working directory.
import pandas as pd

acme_csv_path = 'ACMETelephoneABT.csv'
df = pd.read_csv(acme_csv_path)

In [None]:
# Check: preview the first rows.
# A bare last-line expression uses Jupyter's rich table display, which is
# easier to read than print()'s plain-text dump of a DataFrame.
df.head()

In [None]:
# Descriptive statistics for every column (include='all' covers both
# numeric and non-numeric attributes).
descriptive_stats = df.describe(include='all')

# Display as the cell's last expression so Jupyter renders a full table;
# print() would produce a plain-text dump instead.
descriptive_stats

In [5]:
# Exercise - 5 minutes
# Examine the descriptive statistics. What do you see?

In [None]:
# Histograms for numerical attributes: one subplot per numeric column.

import matplotlib.pyplot as plt

hist_options = dict(figsize=(20, 16), bins=10, grid=True)
df.hist(**hist_options)
plt.show()

In [None]:
# Exercise - 5 minutes
# What do you see in the histograms?
# Look at 'lastMonthCustomerCareCalls'. What does this indicate?

In [None]:
# Show attribute names, then the dtype pandas inferred for each column.
for schema_view in (df.columns, df.dtypes):
    print(schema_view)

In [None]:
# Create a 100%-stacked bar chart: share of each churn value within each regionType.
# NOTE: the column name ' regionType' carries a leading space, as used in the original cell.

# Number of rows for every (regionType, churn) combination.
counts = df.groupby([' regionType', 'churn']).size()

# Convert counts to within-region percentages. transform('sum') returns a
# Series aligned to the original index, so the result keeps exactly the
# (regionType, churn) index regardless of pandas version (groupby().apply()
# can prepend an extra group-key level).
percentages = 100 * counts / counts.groupby(level=0).transform('sum')

# One row per region, one column per churn value.
grouped = percentages.unstack('churn')

grouped.plot(kind='bar', stacked=True, figsize=(10, 6))

plt.title('Stacked Bar Chart of Region Type by Churn')
plt.xlabel('Region Type')
# The bars show percentages (each bar sums to 100), not raw counts.
plt.ylabel('Percentage of customers (%)')

plt.show()

In [None]:
# Exercise - 5 minutes
# Select another attribute of interest
# Create a stacked bar chart. Analyze.

In [None]:
# Create a 100%-stacked bar chart: share of each churn value within each occupation.
# NOTE: the column name ' occupation' carries a leading space, as used in the original cell.

# Number of rows for every (occupation, churn) combination.
counts = df.groupby([' occupation', 'churn']).size()

# Convert counts to within-occupation percentages. transform('sum') returns a
# Series aligned to the original index, so the result keeps exactly the
# (occupation, churn) index regardless of pandas version (groupby().apply()
# can prepend an extra group-key level).
percentages = 100 * counts / counts.groupby(level=0).transform('sum')

# One row per occupation, one column per churn value.
grouped = percentages.unstack('churn')

grouped.plot(kind='bar', stacked=True, figsize=(10, 6))

plt.title('Stacked Bar Chart of Occupation by Churn')
plt.xlabel('Occupation')
# The bars show percentages (each bar sums to 100), not raw counts.
plt.ylabel('Percentage of customers (%)')

plt.show()

In [9]:
# Remove leading/trailing whitespace from the churn labels.
# Re-running this cell is harmless: stripping already-clean values is a no-op.
churn_stripped = df['churn'].str.strip()
df = df.assign(churn=churn_stripped)

### Simple Decision Tree

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Target is the churn label; every other column is a candidate feature.
y = df['churn']

# One-hot encode the categorical columns so all features are numeric.
X = pd.get_dummies(df.drop(columns='churn'))

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Decision tree with no depth limit; seeded so tie-breaking is repeatable.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train on the training split (the fitted estimator is shown as cell output).
dt_classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict churn for the held-out rows.
y_pred = dt_classifier.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

In [None]:
# Plot the unrestricted tree.
# Very slow! The tree was grown without a depth limit, so there are many nodes to draw.
from sklearn import tree

plt.figure(figsize=(20,10))
tree.plot_tree(decision_tree=dt_classifier, filled=True)
plt.show()

In [16]:
# Exercise - 5 minutes
# Very large decision tree! Poor accuracy.
# Why?

In [None]:
# Depth
# Report how deep the currently fitted decision tree is.
dt_classifier.get_depth()

In [None]:
# Same model, but capped at depth 4 to limit overfitting; fit() returns the
# estimator itself, so construction and training can be chained.
dt_classifier = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train, y_train)

# Score the shallow tree on the held-out rows.
y_pred = dt_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')


In [None]:
# Plot the depth-limited tree with readable feature names on each split.
plt.figure(figsize=(20,10))
# Pass the names as a plain list: some scikit-learn releases validate
# feature_names strictly and reject a pandas Index.
tree.plot_tree(dt_classifier, filled=True, feature_names=list(X.columns))
plt.show()

In [20]:
# Exercise - 5 min
# Can you find a setting for the max_depth hyperparameter
# that produces an accuracy above 60%?


### Random Forest

In [None]:
# Create and train the random forest classifier

from sklearn.ensemble import RandomForestClassifier

# random_state fixes the bootstrap samples and feature subsets so the reported
# accuracy is the same on every run (the decision trees above are seeded the same way).
clf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.2f}")



In [None]:
# Exercise - 5 minutes
# Can you find a setting of the n_estimators and max_depth hyperparameters
# that produces an accuracy above 65%?

### XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Convert [true false] to [1 0] because XGB requires this.
# The encoded labels are stored under new names instead of overwriting
# y_train / y_test, so the original labels stay intact and earlier cells
# behave the same if they are re-run after this one.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Adjust parameters as needed; random_state keeps the run repeatable.
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb_model.fit(X_train, y_train_encoded)

# Make predictions on the test set (predictions are encoded labels)
y_pred = xgb_model.predict(X_test)

# Evaluate accuracy against the encoded test labels
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f"XGBoost Accuracy: {accuracy:.2f}")

### Simple neural network

In [None]:
# Preview the one-hot-encoded training features that will be fed to the network.
X_train.head()

In [None]:
# Neural network

from sklearn.neural_network import MLPClassifier

# Create the neural network model: two hidden layers (64 and 32 units).
# random_state fixes the weight initialisation and batch shuffling so the
# reported accuracy is repeatable.
clf = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', alpha=0.001, max_iter=100, random_state=42)

# Train the model (on the unscaled features)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

### Neural network with several hidden layers

In [None]:
# Deeper network: five hidden layers shrinking from 64 down to 4 units.
# random_state fixes the weight initialisation and batch shuffling so the
# reported accuracy is repeatable.
clf = MLPClassifier(hidden_layer_sizes=(64, 32, 16, 8, 4), activation='relu', solver='adam', alpha=0.001, max_iter=100, random_state=42)

# Train the model (on the unscaled features)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Display the raw DataFrame to inspect the value ranges of the attributes.
# NOTE(review): this renders the whole frame; df.head() would be a lighter preview.
df

In [34]:
# But wait! We forgot to normalize all of the values between 0 and 1

from sklearn.preprocessing import  MinMaxScaler

# Rescale every feature to the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn each feature's min/max from the training data only, and scale it in
# the same step.
X_train_scaled = scaler.fit_transform(X_train)

# Apply the training-set min/max to the test features.
X_test_scaled = scaler.transform(X_test)

In [None]:
# Same two-hidden-layer network as before, now trained on the scaled features
# and allowed up to 200 iterations. random_state fixes the weight
# initialisation and batch shuffling so the reported accuracy is repeatable.
clf = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', alpha=0.001, max_iter=200, random_state=42)

# Train the model
clf.fit(X_train_scaled, y_train)

# Make predictions on the test set (scaled with the same scaler)
y_pred = clf.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Natural Language Processing
## IMDB Movies Sentiment Analysis

Adapted from AI Publishing

In [38]:
# Load the IMDB reviews using pandas' Python parsing engine.
# The CSV is expected in the notebook's working directory.
movie_dataset = pd.read_csv('imdb_reviews.csv', engine='python')

In [None]:
# Preview the first rows of the reviews dataset.
movie_dataset.head()

In [None]:
# (number of rows, number of columns) of the reviews dataset.
movie_dataset.shape

In [None]:
import matplotlib.pyplot as plt

# Default figure size for this and all following plots.
plt.rcParams["figure.figsize"] = [8,10]

# Pie chart of how many reviews carry each label, annotated with whole-number percentages.
label_counts = movie_dataset['label'].value_counts()
label_counts.plot(kind='pie', autopct="%1.0f%%")

In [42]:
# Inputs are the raw review texts; targets are the sentiment labels.
X, y = movie_dataset.text, movie_dataset.label

In [43]:
import re

def clean_text(doc):
  """Normalize a raw review string for vectorization.

  Steps, in order:
    1. Replace every character that is not an ASCII letter with a space.
    2. Remove single-letter tokens.
    3. Collapse whitespace runs to one space and trim both ends.

  Args:
    doc: The raw review text (str).

  Returns:
    The cleaned text: words of two or more ASCII letters separated by
    single spaces, with no leading or trailing whitespace.
  """
  # Keep letters only
  document = re.sub('[^a-zA-Z]', ' ', doc)
  # Remove single characters. Word boundaries are zero-width, so this also
  # catches consecutive single letters ("a b c") and ones at the very start
  # or end of the text, which a whitespace-delimited pattern misses because
  # each match consumes the whitespace the next match would need.
  document = re.sub(r'\b[a-zA-Z]\b', ' ', document)
  # Remove multiple empty spaces
  document = re.sub(r'\s+', ' ', document)

  # Trim the space left behind when the text started or ended with removed content.
  return document.strip()

In [44]:
# Clean reviews: run every raw review through clean_text, preserving order.

reviews = list(X)
X_sentences = [clean_text(review_text) for review_text in reviews]

In [None]:
# Inspect the first four cleaned reviews.
X_sentences[0:4]

In [None]:
# Stop words and TFIDF
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

english_stop_words = stopwords.words('english')

# max_features = 2000 => keep only the 2000 most frequent terms
# min_df = 5 => a term must appear in at least 5 documents
# max_df = 0.7 => a term cannot appear in more than 70% of documents
vectorizer = TfidfVectorizer(
    stop_words=english_stop_words,
    max_features=2000,
    min_df=5,
    max_df=0.7,
)

# Learn the vocabulary and build a dense TF-IDF matrix (one row per review).
tfidf_sparse = vectorizer.fit_transform(X_sentences)
X = tfidf_sparse.toarray()

In [None]:
# Inspect the TF-IDF rows for the first four reviews.
X[0:4]

In [48]:
# Split into training and testing set

from sklearn.model_selection import train_test_split

# 80% train / 20% test; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# 250 seeded trees for the sentiment classifier.
clf = RandomForestClassifier(random_state=42, n_estimators=250)
# Takes 20 seconds
clf.fit(X=X_train, y=y_train)

In [None]:
# Performance

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay

# Predict the label of every held-out review.
y_pred = clf.predict(X_test)

# Compute the confusion matrix once; print it and draw it as a heat map.
cm = confusion_matrix(y_test, y_pred)
print(cm)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

# Per-class precision / recall / F1, followed by overall accuracy.
for report in (classification_report(y_test, y_pred), accuracy_score(y_test, y_pred)):
    print(report)

In [None]:
# 5 minute exercise
# Interpret the confusion matrix, the classification report, and the accuracy


In [None]:
# Predict an instance
# 1 = positive sentiment, 0 = negative sentiment
sample_reviews = (
    "The movie was excellent. I loved it.",
    "The movie was terrible. I hated it.",
)
for sample in sample_reviews:
    # Reuse the fitted vectorizer so the sample is mapped onto the same vocabulary.
    print(clf.predict(vectorizer.transform([sample])))

In [None]:
# 5 minute exercise
# Try other movie reviews. Are the results what you expected?

# Convolutional Neural Network
## Classifying objects in images

Adapted from "AI and Deep Learning" by D'Ascoli


In [53]:
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plot
from keras.datasets import cifar10



In [None]:
# Load data: CIFAR-10 comes pre-split into training and test images with labels.
(X_train, Y_train), (X_test, Y_test) = cifar10.load_data()

# Show the shape of each of the four arrays.
tuple(split.shape for split in (X_train, Y_train, X_test, Y_test))

In [None]:
# Plot images: show the first 20 training images with their class labels.

import matplotlib.pyplot as plt

# CIFAR-10 class names, indexed by integer label:
# 0 airplane
# 1 automobile (cars, not trucks or pickup trucks)
# 2 bird
# 3 cat
# 4 deer
# 5 dog
# 6 frog
# 7 horse
# 8 ship
# 9 truck (not pickup trucks)
CLASS_NAMES = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']

images = X_train[:20]

fig, axs = plt.subplots(4, 5, figsize=(10, 5))

for idx, ax in enumerate(axs.ravel()):
    # No cmap argument: colormaps only apply to single-channel images and are
    # ignored for colour (RGB) data.
    ax.imshow(images[idx])
    # .item() extracts the plain integer whether the label is stored as a
    # scalar or as a one-element array, so the title reads "6" rather than "[6]".
    label = int(Y_train[idx].item())
    ax.set_title(f"Label: {label} ({CLASS_NAMES[label]})")
    ax.axis('off')

# Keep the titles from overlapping the images in the row above.
plt.tight_layout()
plt.show()

In [56]:
# Scale values: map pixel intensities from [0, 255] to [0, 1].

# Only scale while the arrays still hold integer pixel data. Without this
# guard, re-running the cell would divide the already-scaled values by 255 a
# second time and silently corrupt the inputs.
if X_train.dtype.kind in 'iu':
    X_train, X_test = X_train/255.0, X_test/255.0

In [57]:
# Build network
# Layers, in order:
#   1. 50 convolutional filters, 2x2, ReLU, on 32x32 RGB inputs
#   2. 3x3 max pooling
#   3. flatten to a vector
#   4. dense layer with 50 ReLU units
#   5. 10-way softmax output (one unit per class)

CNN_model = models.Sequential([
    layers.Conv2D(50, (2, 2), activation='relu', input_shape=(32, 32, 3)),
    layers.MaxPooling2D((3,3)),
    layers.Flatten(),
    layers.Dense(50, activation='relu'),
    layers.Dense(10, activation='softmax'),
])




In [58]:
# Adam optimizer with a learning rate of 0.005.
optimizer = tf.optimizers.Adam(learning_rate = .005)

# Labels are integer class ids and the network already outputs softmax
# probabilities, hence sparse categorical cross-entropy with from_logits=False.
CNN_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
    optimizer=optimizer,
)


In [59]:
# Train model
# take 10 minutes
# Default batch size is 32
# The test split is passed as validation data, so its metrics are reported after every epoch.
history = CNN_model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_test, Y_test),
    epochs=5,
)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Plot performance: training vs. validation accuracy per epoch.
import matplotlib.ticker as ticker

training_log = history.history

fig, ax = plt.subplots()
for metric in ('accuracy', 'val_accuracy'):
    ax.plot(training_log[metric], label=metric)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend(loc = 'upper left')
# One tick per epoch (epochs are indexed from 0 on this axis).
ax.set_xticks(range(0, len(training_log['loss'])))
plt.show()

In [None]:
# Print structure
# Lists each layer of the network with its output shape and parameter count.

CNN_model.summary()


In [None]:
# The End