# Joins and Merges

In [1]:
import pandas as pd

df1 = pd.DataFrame({
    'X' : [0,2,4,6,8,10,12,14,16,18],
    1 : [100,101,102,103,104,105,106,107,108,109],
    2 : [200,201,202,203,204,205,206,207,208,209]
})

df2 = pd.DataFrame({
    'X' : [0,3,6,9,12,15,18,21,24,27],
    1 : [100,101,102,103,104,105,106,107,108,109],
    2 : [200,201,202,203,204,205,206,207,208,209]
})

display(df1)
display(df2)

Unnamed: 0,X,1,2
0,0,100,200
1,2,101,201
2,4,102,202
3,6,103,203
4,8,104,204
5,10,105,205
6,12,106,206
7,14,107,207
8,16,108,208
9,18,109,209


Unnamed: 0,X,1,2
0,0,100,200
1,3,101,201
2,6,102,202
3,9,103,203
4,12,104,204
5,15,105,205
6,18,106,206
7,21,107,207
8,24,108,208
9,27,109,209


In [2]:
outer = df1.merge(df2 ,how='outer', on='X', suffixes=['_from1', '_from2'])
inner = df1.merge(df2 ,how='inner', on='X', suffixes=['_from1', '_from2'])
right = df1.merge(df2 ,how='right', on='X', suffixes=['_from1', '_from2'])
left = df1.merge(df2 ,how='left', on='X', suffixes=['_from1', '_from2'])

display(outer)
display(inner)
display(right)
display(left)

Unnamed: 0,X,1_from1,2_from1,1_from2,2_from2
0,0,100.0,200.0,100.0,200.0
1,2,101.0,201.0,,
2,4,102.0,202.0,,
3,6,103.0,203.0,102.0,202.0
4,8,104.0,204.0,,
5,10,105.0,205.0,,
6,12,106.0,206.0,104.0,204.0
7,14,107.0,207.0,,
8,16,108.0,208.0,,
9,18,109.0,209.0,106.0,206.0


Unnamed: 0,X,1_from1,2_from1,1_from2,2_from2
0,0,100,200,100,200
1,6,103,203,102,202
2,12,106,206,104,204
3,18,109,209,106,206


Unnamed: 0,X,1_from1,2_from1,1_from2,2_from2
0,0,100.0,200.0,100,200
1,3,,,101,201
2,6,103.0,203.0,102,202
3,9,,,103,203
4,12,106.0,206.0,104,204
5,15,,,105,205
6,18,109.0,209.0,106,206
7,21,,,107,207
8,24,,,108,208
9,27,,,109,209


Unnamed: 0,X,1_from1,2_from1,1_from2,2_from2
0,0,100,200,100.0,200.0
1,2,101,201,,
2,4,102,202,,
3,6,103,203,102.0,202.0
4,8,104,204,,
5,10,105,205,,
6,12,106,206,104.0,204.0
7,14,107,207,,
8,16,108,208,,
9,18,109,209,106.0,206.0


# Pivot Tables
- Going from long -> wide : pivot table
- Going from wide -> long : melting

In [3]:
weather = pd.DataFrame({
    'Time' : [800,830,900,930,1000,1030,1100,1130,1200],
    'Temp' : [43,45,52,55,57,64,67,75,78],
    'Wind' : [2,5,4,3,2,5,3,3,2],
    'RH' : [45,25,95,63,35,48,82,51,24]
})

display(weather)

Unnamed: 0,Time,Temp,Wind,RH
0,800,43,2,45
1,830,45,5,25
2,900,52,4,95
3,930,55,3,63
4,1000,57,2,35
5,1030,64,5,48
6,1100,67,3,82
7,1130,75,3,51
8,1200,78,2,24


In [4]:
pivot = pd.pivot_table(weather, index='Temp', columns='Wind', values='RH', aggfunc='mean')
pivot

Wind,2,3,4,5
Temp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
43,45.0,,,
45,,,,25.0
52,,,95.0,
55,,63.0,,
57,35.0,,,
64,,,,48.0
67,,82.0,,
75,,51.0,,
78,24.0,,,


In [5]:
pd.melt(pivot)

Unnamed: 0,Wind,value
0,2,45.0
1,2,
2,2,
3,2,
4,2,35.0
5,2,
6,2,
7,2,
8,2,24.0
9,3,


# Supervised
- Discrete
    - Classification: 
        - Models:
            - Naive Bayes
            - Logistic Regression
        - Evaluation:
            - Prediction comes first in ordered pair and then actual second...
            - TP = ++, FP = +-, FN = -+, TN = --
            - Accuracy: $\frac{TP + TN}{TP + FP + TN + FN}$ TL,BR of confusion matrix
            - Precision: $\frac{TP}{TP + FP}$ TL,BL of confusion matrix
            - Recall: $\frac{TP}{TP + FN}$  TL,Tr fo confusion matrix
            - F1 score $\frac{2 * precision * recall}{precision + recall}$ Harmonic mean of precision & recall 
- Continuous
    - Regression:
        - Models:
            - Linear Regression
        - Evaluation:
            - Mean Absolute Error: $\frac{1}{n}\sum{|{y_i - \hat{y}_i}|}$
            - Sum Squared Error: $\sum{|{y_i - \hat{y}_i}|}^2$
            - Mean Squared Error: $\frac{1}{n}\sum{|{y_i - \hat{y}_i}|}^2$
            - Root Mean Squared Error: $\sqrt{\frac{1}{n}\sum{|{y_i - \hat{y}_i}|}^2}$
            - Root Mean Squared Log Error: $\sqrt{\frac{1}{n} Σ [log(1 + y_i) - log(1 + ŷ_i)]^2}$ or $\sqrt{\frac{1}{n} Σ [log(\frac{1 + y_1}{1 + ŷ_1})]^2}$
                - uses the natural log "np.log"

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

def accuracy(x_predict, x_true):
    sum = 0

    for i in range(len(x_predict)):
        if x_predict[i] == x_true[i]:
            sum += 1
    return sum/len(x_predict)

def precision(x_predict, x_true, trueVal):
    sum = 0
    total = 0

    for i in range(len(x_predict)):
        if x_predict[i] == trueVal:
            total += 1
        if (x_predict[i] == x_true[i] and x_predict[i] == trueVal):
            sum += 1
       
    return sum/total

def recall(x_predict, x_true, trueVal):
    sum = 0
    total = 0

    for i in range(len(x_predict)):
        if (x_true[i] == trueVal and x_predict[i] != trueVal):
            total += 1
        if (x_predict[i] == x_true[i] and x_predict[i] == trueVal):
            sum += 1
    
    return sum/(sum + total)

def f1score(x_predict, x_true, trueVal):
    p = precision(x_predict, x_true, trueVal)
    r = recall(x_predict, x_true, trueVal)

    return (2 * p * r) / (p + r)

def mae(actual, calculated):
    sum = 0

    for i in range(len(actual)):
        sum += abs(calculated[i] - actual[i])

    return sum/len(actual)

def mse(actual, calculated):
    sum = 0

    for i in range(len(actual)):
        sum += pow((calculated[i] - actual[i]), 2)

    return sum/len(actual)

def sse(actual, calculated):
    sum = 0

    for i in range(len(actual)):
        sum += pow((calculated[i] - actual[i]), 2)

    return sum

def rmse(actual, calculated):
    sum = 0

    for i in range(len(actual)):
        sum += pow((calculated[i] - actual[i]), 2)

    return np.sqrt(sum/len(actual))

def rmsle(actual, calculated):
    sum = 0

    for i in range(len(actual)):
        sum += np.log(((1 + calculated[i]) / (1 + actual[i])))

    sum = sum / len(actual)
    return np.sqrt(pow(sum/len(actual), 2))

In [11]:
predicted = np.array([120.4, 121.3, 120.9, 122.0])
actual = np.array([123.2, 119.4, 121.7, 127.4])

print(mae(predicted, actual))
print(sse(predicted, actual))
print(mse(predicted, actual))
print(rmse(predicted, actual))
print(rmsle(predicted, actual))

2.724999999999998
41.25000000000001
10.312500000000002
3.2113081446662823
0.0035407582605411145
