In [3]:
#Load the Titanic dataset (available from Kaggle) from the local CSV file; a copy is provided with this exercise and you can upload it to your drive folder.
import pandas as pd
df = pd.read_csv('titanic3.csv')
#Keep an independent snapshot of the raw data in orig_df.
#A plain "orig_df = df" would only give the same frame a second name, so any
#in-place change made through df would also show up in orig_df.
orig_df = df.copy()

In [4]:
#Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [None]:
#Explore the Titanic dataset
#Print the dataset and check how many samples (rows) and features (columns) it has
print(orig_df)

In [None]:
#Check the data type (dtype) of each column
#Integer columns are fine as they are, and float64 columns are fine too (you may convert them to int).
#Object columns (how pandas usually stores strings) need to be converted to numbers.
df.dtypes

In [None]:
#Use the shape attribute to check the number of rows (samples) and columns (features)
df.shape

In [None]:
#Use the describe() method to explore the summary statistics of the data
#Can you tell me a little bit about the data, for example the age groups?
df.describe()

In [None]:
#Use describe() together with iloc to look at the statistics of only some of the columns
#The slice :3 keeps the first 3 columns of the summary; use a larger stop value (e.g. 4) to include more features
#(this is useful when you have lots of features)
df.describe().iloc[:,:3]

In [None]:
#isna() (the same check as isnull()) flags every missing cell; summing the flags column by column
#gives the number of missing values in each feature
#Which features are the leak features?
df.isna().sum()

In [None]:
#Build a boolean DataFrame with the same layout as df: True where a cell is missing, False otherwise.
#It can be used to inspect where the data is missing.
mask = df.isna()

In [None]:
mask.head()  # first rows of the missing-value flags

In [None]:
#Improve the mask by reducing it to one flag per row: any() along the columns
#returns True for a row when at least one of its values is missing
mask = df.isna().any(axis="columns")

In [None]:
#Preview the first entries of the mask
mask.head()

In [None]:
#Check the body column for the rows selected by the mask
df.loc[mask, "body"].head()

In [None]:
#Check the age column for the rows selected by the mask
df.loc[mask, "age"].head()


In [None]:
#Check the embarked column for the rows selected by the mask
df.loc[mask, "embarked"].head()

In [None]:
#Use the .value_counts method to count how often each value occurs (how many male and female)
#dropna=False keeps missing values (NaN) in the counts instead of leaving them out
df["sex"].value_counts(dropna=False)

In [None]:
#Count how often each embarked value occurs; dropna=False also counts the missing values
df["embarked"].value_counts(dropna=False)

In [None]:
#Count how often each age value occurs; dropna=False also counts the missing values
df["age"].value_counts(dropna=False)

In [None]:
#Delete the columns (not rows) that have a high percentage of missing values or are not used as features
#The result is built from orig_df rather than from df, so this cell can be re-run safely:
#"df = df.drop(...)" raises a KeyError on a second run because the columns are already gone.
df = orig_df.drop(
    columns=[
        "name",
        "ticket",
        "home.dest",
        "boat",
        "body",
        "cabin",
    ]
)

In [None]:
#Use the describe() method to check whether you managed to delete the columns
#Compare it with the df.describe() output above
df.describe()

In [None]:
#Working with missing data
#This cell is safe to re-run: the encoding steps are skipped once a column is already numeric.
#(Mapping an already-encoded column again would turn every value into NaN, and the final
#fillna(0) would then silently overwrite the whole column with 0.)

#Populate age missing values with their median
#NOTE(review): the median is computed on the whole dataset, before the train/test split further down
df['age'] = df['age'].fillna(df['age'].median())

#Populate embarked missing values with the most frequent value
#(computed from the data instead of hard-coding 'S')
df['embarked'] = df['embarked'].fillna(df['embarked'].mode().iloc[0])

# map sex to a numeric type (only while it still holds the text labels)
if not pd.api.types.is_numeric_dtype(df['sex']):
    df['sex'] = df['sex'].map({'male': 1, 'female': 0})

# map embarked to a numeric type (only while it still holds the text labels)
if not pd.api.types.is_numeric_dtype(df['embarked']):
    df['embarked'] = df['embarked'].map({'S': 2, 'C': 1, 'Q': 0})

#Fill any other missing value with 0 (not good practice, but it avoids the common error of NaN values still being present)
df = df.fillna(0)

In [None]:
#Check the sex column
print(df["sex"])

In [None]:
#Split the data into training and testing datasets
from sklearn.model_selection import train_test_split

#Targets: the survived column
y = df["survived"]
#Samples: every column except survived
X = df.drop(columns=["survived"])
#Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=11,
)

In [None]:
#Check y_train (the targets of the training split)
print(y_train)

In [None]:
#Check X_train (the samples of the training split)
print(X_train)

In [None]:
#Create the ML model (logistic regression with the liblinear solver) and train it on the training split
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver="liblinear")
clf.fit(X=X_train, y=y_train)

In [None]:
#Get the predicted labels (model output for the test samples) and the expected labels (the test targets)
#Can you tell what the predicted and expected values represent?
#Can you derive the misclassified (wrong) values?
predicted, expected = clf.predict(X_test), y_test


In [None]:
#Now print the model accuracy on the test split as a percentage
accuracy = clf.score(X_test, y_test)
print(f'{accuracy:.2%}')
#Display the labels predicted for the test split
clf.predict(X_test)

In [None]:
#Can you evaluate and validate the model using k-fold?
#Can you get the confusion matrix?

In [None]:
#Save and test the model
import pickle

# Serialize the trained model into an in-memory pickle (a bytes object; nothing is written to disk).
saved_model = pickle.dumps(clf)

# Rebuild the model from the pickled bytes.
# Only unpickle data you trust: loading a pickle can execute arbitrary code.
clf_from_pickle = pickle.loads(saved_model)

# Make predictions with the restored model
clf_from_pickle.predict(X=X_test)

In [None]:
# A model produced by scikit-learn (sklearn) can be saved with Python's joblib library,
# which is often preferred for scikit-learn models due to its efficiency.

# Make sure the joblib library is installed first. If it is missing, install it with pip:

# pip install joblib

# With the trained model ready to be saved, import joblib:
import joblib


# Save the model as a pickle in a file
# joblib.dump() takes the model and the file path where it should be stored:
joblib.dump(value=clf, filename='filename.pkl')

# Load the model from the file
# The model is now stored in the file with the ".pkl" extension.
# joblib.load() reads it back whenever it is needed again
# (only load files you trust: they are pickles underneath):
clf_from_joblib = joblib.load(filename='filename.pkl')

# Make predictions with the loaded model
clf_from_joblib.predict(X=X_test)

# Saving and loading this way preserves the state of a scikit-learn model for future use, so it does not have to be retrained.