In [1]:
import pandas as pd
import numpy as np

# Titanic Data

Okay, let's read the "Titanic" data.

In [2]:
# Source of the raw (uncleaned) Titanic passenger table.
TITANIC_CSV_URL = "https://raw.githubusercontent.com/Geoyi/Cleaning-Titanic-Data/master/titanic_original.csv"

titanic_data = pd.read_csv(TITANIC_CSV_URL)
# Preview the first five rows.
titanic_data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


Let's see the number of missing values for each column.

In [3]:
# Per-column count of missing entries (isna is the same check as isnull).
titanic_data.isna().sum()

pclass          1
survived        1
name            1
sex             1
age           264
sibsp           1
parch           1
ticket          1
fare            2
cabin        1015
embarked        3
boat          824
body         1189
home.dest     565
dtype: int64

# Iris Data

Okay, let's read the "Iris" data.

In [4]:
# The UCI file ships without a header row, so the column names are supplied here.
IRIS_CSV_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
IRIS_COLUMNS = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class_name"]

iris_data = pd.read_csv(IRIS_CSV_URL, header=None, names=IRIS_COLUMNS)
# Preview the first five rows.
iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class_name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


Let's see the shape of the data. The first number is the number of observations (rows); the second is the number of columns (here, the four measured features plus the class label).

In [5]:
# DataFrame.shape is a (number of rows, number of columns) tuple.
iris_data.shape

(150, 5)

Let's see all unique class names.

In [6]:
# Distinct values of the label column, in order of first appearance.
iris_data["class_name"].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

Let's take all Iris flowers whose sepal length is between 4.5 and 10 (inclusive).
After that, we take the average petal length of these flowers and round the answer to 2 decimal places.

In [7]:
# Keep the flowers with 4.5 <= sepal_length <= 10; Series.between includes
# both endpoints by default.
filtered = iris_data[iris_data.sepal_length.between(4.5, 10)]

# Average only the petal_length column. Calling np.mean on the whole frame
# would also try to reduce the string class_name column, which fails on
# recent pandas versions (non-numeric columns are no longer silently dropped).
average_petal_length = filtered.petal_length.mean()
print(round(average_petal_length, 2))

3.83


# Wine Quality Data

Okay, let's read the two datasets (red and white) that make up the "Wine Quality" data.

In [8]:
# The wine-quality files use ';' as the field separator.
RED_WINE_CSV_PATH = "data/winequality-red.csv"

red_wine_data = pd.read_csv(RED_WINE_CSV_PATH, sep=";")
# Preview the first five rows.
red_wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [9]:
# The wine-quality files use ';' as the field separator.
WHITE_WINE_CSV_PATH = "data/winequality-white.csv"

white_wine_data = pd.read_csv(WHITE_WINE_CSV_PATH, sep=";")
# Preview the first five rows.
white_wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


Let's see which kind of wine seems to be preferred (i.e. has the greater average quality).

In [10]:
# Mean quality score per wine kind; printed red first, then white.
red_mean_quality = np.average(red_wine_data["quality"])
white_mean_quality = np.average(white_wine_data["quality"])

print(red_mean_quality)
print(white_mean_quality)

5.63602251407
5.87790935076


Let's check the correlation between alcohol content and quality for each kind of wine.

In [11]:
# Correlation between quality and alcohol for each wine kind
# (Series.corr uses the Pearson method by default).
red_wine_correlation = red_wine_data["quality"].corr(red_wine_data["alcohol"])
white_wine_correlation = white_wine_data["quality"].corr(white_wine_data["alcohol"])

# Report red first, then white, each rounded to two decimals.
print(round(red_wine_correlation, 2))
print(round(white_wine_correlation, 2))

0.48
0.44
