# Data Retrieval

In [14]:
import pandas as pd
# turn off pandas' chained-assignment warning (the default setting is 'warn')
pd.set_option('mode.chained_assignment', None)

# read the student records from CSV and display the resulting frame
df = pd.read_csv('student_records.csv')
df

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No
5,Simon,A,Y,92,79,Yes
6,Robert,B,Y,60,59,No
7,Trent,C,Y,75,33,No


# Data Preparation

## Feature extraction and engineering

In [2]:
# get features and corresponding outcomes
feature_names = ['OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']
# take an explicit copy: the numeric columns of this frame are overwritten
# later in the notebook, and assigning into a selection taken from `df` is
# what raises pandas' SettingWithCopyWarning
training_features = df[feature_names].copy()

# selecting with a one-element list keeps the outcome as a single-column
# DataFrame rather than a Series
outcome_name = ['Recommend']
outcome_labels = df[outcome_name]

In [3]:
# view the selected feature columns (the last expression of a cell is displayed)
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,90,85
1,C,N,85,51
2,F,N,10,17
3,B,Y,75,71
4,E,N,20,30
5,A,Y,92,79
6,B,Y,60,59
7,C,Y,75,33


In [4]:
# view the outcome labels that correspond row-by-row to the features
outcome_labels

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,No
4,No
5,Yes
6,No
7,No


In [5]:
# group the feature columns by type: numeric scores vs. categorical codes
numeric_feature_names = [
    'ResearchScore',
    'ProjectScore',
]
categoricial_feature_names = [
    'OverallGrade',
    'Obedient',
]

## Numeric feature scaling

In [6]:
from sklearn.preprocessing import StandardScaler

# fit the scaler on the numeric features and scale them in a single step;
# `ss` stays fitted afterwards, so it can still transform other data
ss = StandardScaler()
training_features[numeric_feature_names] = ss.fit_transform(
    training_features[numeric_feature_names]
)

# view updated featureset
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,0.899583,1.37665
1,C,N,0.730648,-0.091777
2,F,N,-1.80339,-1.560203
3,B,Y,0.392776,0.772004
4,E,N,-1.465519,-0.998746
5,A,Y,0.967158,1.117516
6,B,Y,-0.114032,0.253735
7,C,Y,0.392776,-0.869179


## Engineering categorical features

In [7]:
# one-hot encode the categorical columns: each listed column is replaced by
# one indicator column per distinct value
training_features = pd.get_dummies(
    data=training_features,
    columns=categoricial_feature_names,
)
# view newly engineered features
training_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y
0,0.899583,1.37665,1,0,0,0,0,0,1
1,0.730648,-0.091777,0,0,1,0,0,1,0
2,-1.80339,-1.560203,0,0,0,0,1,1,0
3,0.392776,0.772004,0,1,0,0,0,0,1
4,-1.465519,-0.998746,0,0,0,1,0,1,0
5,0.967158,1.117516,1,0,0,0,0,0,1
6,-0.114032,0.253735,0,1,0,0,0,0,1
7,0.392776,-0.869179,0,0,1,0,0,0,1


In [15]:
# get list of new categorical features
# walk the columns in DataFrame order and drop the numeric ones; a set
# difference would return the same names in an arbitrary order that can
# change from run to run
categorical_engineered_features = [
    column for column in training_features.columns
    if column not in numeric_feature_names
]

In [16]:
import matplotlib.pyplot as plt
from nltk.parse.stanford import StanfordParser
import numpy as np

%matplotlib inline

In [17]:
# show the installed NumPy version
np.__version__

'1.16.4'

## Math basics


## Vectors

In [19]:
## vectors

# a vector represented as a plain Python list holding 1 through 5
x = list(range(1, 6))
x

[1, 2, 3, 4, 5]

In [20]:
# the same vector as a NumPy array
import numpy as np
x = np.array(range(1, 6))

# show the array, then its type, on separate lines
print(x, type(x), sep='\n')

[1 2 3 4 5]
<class 'numpy.ndarray'>


## Matrices

In [21]:
## matrices

# a 3x3 matrix built from a nested list, one inner list per row
m = np.array([
    [1, 5, 2],
    [4, 7, 4],
    [2, 0, 9],
])

# view the matrix, then its dimensions
print(m, m.shape, sep='\n')

[[1 5 2]
 [4 7 4]
 [2 0 9]]
(3, 3)


## Matrix transpose

In [22]:
# matrix transpose (rows and columns swapped)
print('Matrix Transpose:\n', m.T, '\n')

Matrix Transpose:
 [[1 4 2]
 [5 7 0]
 [2 4 9]] 



# Matrix determinant

In [23]:
# matrix determinant (computed in floating point, so the printed value can
# carry a small rounding error)
determinant = np.linalg.det(m)
print('Matrix Determinant:', determinant, '\n')

Matrix Determinant: -105.00000000000006 



# Matrix Inverse

In [24]:
# invert the matrix; the result is kept in m_inv so it can be multiplied
# back against m afterwards
m_inv = np.linalg.inv(m)
print('Matrix inverse:\n', m_inv, '\n')

Matrix inverse:
 [[-0.6         0.42857143 -0.05714286]
 [ 0.26666667 -0.04761905 -0.03809524]
 [ 0.13333333 -0.0952381   0.12380952]] 



# Identity Matrix

In [25]:
# identity matrix (result of matrix x matrix_inverse)
iden_m = np.dot(m, m_inv)
# round away floating-point noise; adding 0.0 turns any -0. left behind by
# rounding into 0. without hiding the sign of a genuinely negative entry
# (taking the absolute value would make a wrong product look like identity)
iden_m = np.round(iden_m, 0) + 0.0
print('Product of matrix and its inverse:\n', iden_m)

Product of matrix and its inverse:
 [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


# Eigen Decomposition

In [26]:
# eigendecomposition
m = np.array([
    [1, 5, 2],
    [4, 7, 4],
    [2, 0, 9],
])

# eig returns the eigenvalues together with a matrix of eigenvectors
eigen_vals, eigen_vecs = np.linalg.eig(m)

print('Eigen Values:', eigen_vals, '\n')
print('Eigen Vectors:\n', eigen_vecs)

Eigen Values: [-1.32455532 11.32455532  7.        ] 

Eigen Vectors:
 [[-0.91761521  0.46120352 -0.46829291]
 [ 0.35550789  0.79362022 -0.74926865]
 [ 0.17775394  0.39681011  0.46829291]]


# SVD

In [27]:
# SVD
m = np.array([
    [1, 5, 2],
    [4, 7, 4],
    [2, 0, 9],
])

# factorise m into its three SVD components
U, S, VT = np.linalg.svd(m)

print('Getting SVD outputs:-\n')
# print each component under its own label
for label, factor in (('U:\n', U), ('S:\n', S), ('VT:\n', VT)):
    print(label, factor, '\n')

Getting SVD outputs:-

U:
 [[ 0.3831556  -0.39279153  0.83600634]
 [ 0.68811254 -0.48239977 -0.54202545]
 [ 0.61619228  0.78294653  0.0854506 ]] 

S:
 [12.10668383  6.91783499  1.25370079] 

VT:
 [[ 0.36079164  0.55610321  0.74871798]
 [-0.10935467 -0.7720271   0.62611158]
 [-0.92621323  0.30777163  0.21772844]] 



# Descriptive Statistics

In [28]:
# descriptive statistics
import scipy as sp
import numpy as np

# get data
# draw 15 integers from 1 to 19 (randint's upper bound is exclusive) from a
# seeded generator, so the sample and every statistic computed from it are
# the same on each run; a local RandomState leaves NumPy's global random
# state untouched
rng = np.random.RandomState(42)
nums = rng.randint(1, 20, size=15)
print('Data: ', nums)

Data:  [ 9 19  3 17 17  1 18 12 15  1 14  3  6 11 19]


In [29]:
# get descriptive stats
# NumPy supplies the basic statistics directly (the same-named helpers in
# the top-level scipy namespace were only deprecated aliases of these);
# scipy.stats is imported explicitly because `import scipy` alone does not
# guarantee that the submodule is loaded
from scipy import stats

print('Mean:', np.mean(nums))
print('Median:', np.median(nums))
print('Mode:', stats.mode(nums))
print('Standard Deviation:', np.std(nums))
print('Variance:', np.var(nums))
print('Skew:', stats.skew(nums))
print('Kurtosis:', stats.kurtosis(nums))

Mean: 11.0
Median: 12.0
Mode: ModeResult(mode=array([1]), count=array([2]))
Standard Deviation: 6.491019437140312
Variance: 42.13333333333333
Skew: -0.30860553975548427
Kurtosis: -1.4093594776478129
