In [None]:
# All Libraries required for this lab are listed below. The libraries pre-installed on Skills Network Labs are commented.
!pip install pandas==1.3.4
!pip install scikit-learn==1.0.2
!pip install numpy==1.21.6

In [None]:
# Optional: silence the warnings emitted by the code in this notebook.
import warnings


def warn(*args, **kwargs):
    """Do nothing; installed below as a replacement for ``warnings.warn``."""
    return None


# Replace the module-level warn function with the no-op, then ignore anything
# that still goes through the warning filters.
warnings.warn = warn
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

In [None]:
# Location of the mpg data set (CSV) used in this section.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/mpg.csv"

# Read the CSV straight from the URL into a pandas DataFrame (needs network access).

df = pd.read_csv(URL)

In [None]:
# Show 5 random rows from the dataset.
# No random_state is passed, so a different set of rows is shown on each run.
df.sample(5)

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

In [None]:
# Scatter plot of MPG (y axis) against Horsepower (x axis).
df.plot.scatter(x = "Horsepower", y = "MPG")

In [None]:
# The column the regression model is fitted to predict.
target = df["MPG"]

In [None]:
# The input columns the model predicts the target from.
features = df[["Horsepower","Weight"]]

## Create a linear regression model

In [None]:
# Unfitted linear regression model with scikit-learn's default settings.
lr = LinearRegression()

In [None]:
# Fit on the whole data set; no rows are held out in this section.
lr.fit(features,target)

In [None]:
# score() returns R^2 (the coefficient of determination); the higher the score, the better the fit.
# It is computed here on the same rows the model was fitted on, not on unseen data.
lr.score(features,target)

In [None]:
# Predict MPG for one car with Horsepower=100 and Weight=2000.
# The query is wrapped in a DataFrame carrying the training column names because the
# model was fitted on a DataFrame; a bare list makes scikit-learn >= 1.0 warn that
# "X does not have valid feature names".
lr.predict(pd.DataFrame([[100, 2000]], columns=["Horsepower", "Weight"]))

#  Build a Classifier Model using Logistic Regression

In [None]:
# Same warning-suppression snippet as the one near the top of the notebook,
# repeated at the start of this section.
import warnings


def warn(*args, **kwargs):
    """Do nothing; installed below as a replacement for ``warnings.warn``."""
    return None


# Replace the module-level warn function with the no-op, then ignore anything
# that still goes through the warning filters.
warnings.warn = warn
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.linear_model import LogisticRegression

In [None]:
# Location of the iris data set (CSV) used in this section.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/iris.csv"

# Read the CSV into a pandas DataFrame (needs network access).
# NOTE: this rebinds `df`, which held the mpg data in the previous section.

df = pd.read_csv(URL)

In [None]:
# Bar chart of the number of rows per species.
df.Species.value_counts().plot.bar()

In [None]:
# The class label the classifier is fitted to predict.
target = df["Species"]

In [None]:
# The four measurement columns used as classifier inputs.
features = df[["SepalLengthCm","SepalWidthCm","PetalLengthCm","PetalWidthCm"]]

In [None]:
# Unfitted logistic regression classifier with scikit-learn's default settings.
classifier = LogisticRegression()

In [None]:
# Fit on the whole data set; no rows are held out in this section.
classifier.fit(features,target)

In [None]:
# For a classifier, score() returns the mean accuracy; the higher the score, the better the model.
# It is computed here on the same rows the classifier was fitted on, not on unseen data.
classifier.score(features,target)

In [None]:
# Classify one flower from its four measurements.
# The values are wrapped in a DataFrame carrying the training column names because the
# classifier was fitted on a DataFrame; a bare list makes scikit-learn >= 1.0 warn that
# "X does not have valid feature names".
classifier.predict(pd.DataFrame(
    [[5.4, 2.6, 4.1, 1.3]],
    columns=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
))

# Metrics for regression

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

#import functions for train test split

from sklearn.model_selection import train_test_split

# import functions for metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
# Location of the mpg data set (CSV); the same file as in the first regression section.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/mpg.csv"

# Read the CSV into a pandas DataFrame (needs network access).
# NOTE: this rebinds `df`, which held the iris data in the previous section.

df = pd.read_csv(URL)


In [None]:
# Scatter plot of MPG (y axis) against Weight (x axis).
df.plot.scatter(x = "Weight", y = "MPG")

In [None]:
y = df["MPG"] # y is the target

In [None]:
X = df[["Horsepower","Weight"]] # X is the set of features

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# test_size=0.30 holds out 30% of the rows for testing; the rest are used for training.
# random_state controls the shuffling applied to the data before the split is made.
# Passing the same integer gives a reproducible split across multiple function calls.

In [None]:
#Create a linear model
lr = LinearRegression()

In [None]:
#Train/Fit the model using the training data set
lr.fit(X_train,y_train)

In [None]:
# R^2 of the fitted model on the held-out test split; the higher the score, the better the model.
lr.score(X_test, y_test)

In [None]:
#To compute the detailed metrics we need two values, the original mileage and the predicted mileage
original_values = y_test
predicted_values = lr.predict(X_test)


### R squared

In [None]:
r2_score(original_values, predicted_values) # Higher the value the better the model

### Mean Squared Error

In [None]:
mean_squared_error(original_values, predicted_values) # Lower the value the better the model

### Root Mean Squared Error

In [None]:
sqrt(mean_squared_error(original_values, predicted_values)) # Lower the value the better the model

### Mean Absolute Error

In [None]:
mean_absolute_error(original_values, predicted_values) # Lower the value the better the model

# Metrics for Classification

In [None]:
# Same warning-suppression snippet as the one near the top of the notebook,
# repeated at the start of this section.
import warnings


def warn(*args, **kwargs):
    """Do nothing; installed below as a replacement for ``warnings.warn``."""
    return None


# Replace the module-level warn function with the no-op, then ignore anything
# that still goes through the warning filters.
warnings.warn = warn
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.linear_model import LogisticRegression

#import functions for train test split

from sklearn.model_selection import train_test_split


# functions for metrics

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [None]:
# Location of the diabetes data set (CSV) used in this section.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/diabetes.csv"

# Read the CSV into a pandas DataFrame (needs network access).
# NOTE: this rebinds `df`, which held the mpg data in the previous section.

df = pd.read_csv(URL)

In [None]:
# Number of rows for each value of the Outcome column (shows how balanced the classes are).
df.Outcome.value_counts()

In [None]:
# The same per-class row counts, drawn as a bar chart.
df.Outcome.value_counts().plot.bar()

In [None]:
#defining target columns
y = df["Outcome"]

In [None]:
# Defining features columns
X = df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']]

In [None]:
# Hold out 30% of the rows for testing; random_state=40 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)

In [None]:
# Logistic regression classifier.
# max_iter is raised from the default of 100: the features are used unscaled, and the
# default lbfgs solver can hit the iteration limit (ConvergenceWarning) before it has
# converged, which would leave a partially fitted model behind the suppressed warning.
classifier = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training split only; the test split stays unseen.
classifier.fit(X_train,y_train)

In [None]:
# Mean accuracy on the held-out test split; the higher the score, the better the model.
classifier.score(X_test,y_test)

In [None]:
# The detailed metrics compare two sets of labels for the test split:
# the actual Outcome values and the values the classifier predicts.
original_values = y_test
predicted_values = classifier.predict(X_test)

### Precision

In [None]:
precision_score(original_values, predicted_values) # Higher the value the better the model

### Recall

In [None]:
recall_score(original_values, predicted_values) # Higher the value the better the model

### F1 Score

In [None]:
f1_score(original_values, predicted_values) # Higher the value the better the model

### Confusion Matrix

In [None]:
# Counts of actual vs. predicted labels: rows are the actual classes, columns the predicted ones.
confusion_matrix(original_values, predicted_values) # can be used to manually calculate various metrics

# Metrics for clustering

In [None]:
# Same warning-suppression snippet as the one near the top of the notebook,
# repeated at the start of this section.
import warnings


def warn(*args, **kwargs):
    """Do nothing; installed below as a replacement for ``warnings.warn``."""
    return None


# Replace the module-level warn function with the no-op, then ignore anything
# that still goes through the warning filters.
warnings.warn = warn
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

In [None]:
# Generate sample data for clustering; random_state=0 makes the generated points reproducible.
# NOTE: this rebinds `X` and `y`, which held the diabetes features and target in the previous section.
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=0)

# X now contains 300 rows of data, generated by the make_blobs function, spread across 4 clusters.
# In real life we would use an existing data set.

In [None]:
# Apply k-means clustering.
# random_state fixes the random centroid initialisation, so the cluster labels and
# centers are the same on every run.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(X)

In [None]:
# Print cluster centers
kmeans.cluster_centers_

In [None]:
# Plot the clusters and cluster centers
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='viridis')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker='*', s=400, color='black')
plt.show()

### Example

In [None]:
# Location of the customers data set (CSV) used in this example.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/customers.csv"

# Read the CSV into a pandas DataFrame (needs network access).
# NOTE: this rebinds `df`, which held the diabetes data in the previous section.

df = pd.read_csv(URL)

In [None]:
# One histogram per numeric column, to see how each feature is distributed.
df.hist()

In [None]:
# Number of clusters KMeans is asked to find.
# NOTE(review): the value is hard-coded; no selection procedure is shown in this notebook.
number_of_clusters = 3

In [None]:
# Create a KMeans clustering model.
# random_state fixes the random centroid initialisation, so the clusters found are
# the same on every run.
cluster = KMeans(n_clusters = number_of_clusters, random_state=0)

In [None]:
# Train the model on the dataset. fit_transform() also returns each row's distance to
# every cluster center, which is kept in `result`.
# A 'cluster_number' column left behind by an earlier run of the prediction cell is
# dropped first, so re-running this cell never clusters on the model's own output.
result = cluster.fit_transform(df.drop(columns=['cluster_number'], errors='ignore'))

In [None]:
#Your model is now trained. Print cluster centers
cluster.cluster_centers_

## Make predictions

In [None]:
# Label every row with the cluster it is assigned to.
# The label column itself is excluded from the input so this cell can be re-run:
# on a second run `df` already contains 'cluster_number', and predict() must receive
# the same set of features the model was fitted on.
df['cluster_number'] = cluster.predict(df.drop(columns=['cluster_number'], errors='ignore'))

In [None]:
# Number of rows assigned to each cluster.
df.cluster_number.value_counts()