# The `pandas-linear-regression` library

This library uses `pandas` to calculate the following:

1. The slope of the regression line
2. The intercept of the regression line
3. A prediction function

In [137]:
import pandas as pd
# Load the example dataset directly from its GitHub URL (needs network access)
# and preview the first rows.
df_train = pd.read_csv("https://raw.githubusercontent.com/ThomasJewson/datasets/master/ElectricTrainUsage1888/electrictrain.csv")
df_train.head()

Unnamed: 0,Number of cars operating,Miles per week,Passengers per week
0,72,2632,18764
1,48,1211,6688
2,77,2604,16504
3,91,4039,22944
4,94,5047,25063


In [32]:
# Pairwise correlation matrix of the columns (pandas' default method is Pearson).
df_train.corr()

Unnamed: 0,Number of cars operating,Miles per week,Passengers per week
Number of cars operating,1.0,0.957434,0.959063
Miles per week,0.957434,1.0,0.94185
Passengers per week,0.959063,0.94185,1.0


#### The slope of the regression line:

${\large \text{Slope}=m=r\times\frac{\text{Standard Deviation of Y}}{\text{Standard Deviation of X}}=\frac{r \sigma_{y}}{\sigma_{x}}}$

where $r$ is the Pearson correlation coefficient between X and Y.

In [90]:
def slope_single(df, x, y):
    """
    Slope of the regression line that predicts column `y` from column `x`.

    Computed as ``r * std(y) / std(x)``, where ``r`` is the Pearson
    correlation between the two columns. Only the two requested columns are
    touched, so other (possibly non-numeric) columns in `df` do not matter.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    x : label
        Column used as the independent variable.
    y : label
        Column used as the dependent variable.

    Returns
    -------
    float
        The slope ``m`` of the line ``y = m * x + c``.

    Raises
    ------
    KeyError
        If `x` or `y` is not a column of `df`.
    """
    # A column regressed on itself has r == 1 by definition; skipping the
    # computation keeps that slope exactly 1 instead of 1 +/- rounding error.
    r = 1.0 if x == y else df[x].corr(df[y])
    return r * df[y].std() / df[x].std()

In [92]:
# Slope of the line that predicts "Miles per week" (y) from "Number of cars operating" (x).
slope_single(df_train,"Number of cars operating","Miles per week")

89.96210129797845

In [95]:
def slope_x(df, x):
    """
    Slopes of the regression lines that predict every numeric column from `x`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Non-numeric columns are ignored.
    x : label
        Numeric column used as the independent variable.

    Returns
    -------
    pandas.DataFrame
        A single column named `x`, indexed by the numeric column names of
        `df`. The row labelled ``y`` holds the slope of the line predicting
        ``y`` from `x`, i.e. ``r(x, y) * std(y) / std(x)``.

    Raises
    ------
    KeyError
        If `x` is not a numeric column of `df`.
    """
    # Work on the numeric columns only, so a stray text column cannot break
    # the correlation computation.
    numeric = df.select_dtypes(include="number")
    spread = numeric.std()
    # One correlation matrix and one vectorised expression replace the
    # row-by-row frame building (DataFrame.append no longer exists in
    # pandas >= 2.0).
    slopes = numeric.corr()[x] * spread / spread[x]
    return slopes.to_frame(name=x)

In [96]:
# Slopes of the lines that predict each column (rows) from "Number of cars operating" (x).
slope_x(df_train,"Number of cars operating")

Unnamed: 0,Number of cars operating
Number of cars operating,1.0
Miles per week,89.962101
Passengers per week,375.676968


In [141]:
def slope(df):
    """
    Regression-line slopes for every pair of numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Square table indexed and labelled by the numeric column names.
        The *column* label is the x variable of the line and the *row*
        label is the y variable, so ``result.loc[y, x]`` is the slope of
        the line predicting ``y`` from ``x``:
        ``r(x, y) * std(y) / std(x)``.
    """
    numeric = df.select_dtypes(include="number")
    spread = numeric.std()
    # Scale each row by std(y) and each column by 1 / std(x). The correlation
    # matrix is computed once instead of once per cell.
    return numeric.corr().mul(spread, axis="index").div(spread, axis="columns")

In [142]:
# Full slope table: the column is the predictor (x), the row is the response (y).
slope(df_train)

Unnamed: 0,Number of cars operating,Miles per week,Passengers per week
Number of cars operating,1.0,0.01019,0.002448
Miles per week,89.962101,1.0,0.225925
Passengers per week,375.676968,3.926438,1.0


#### The intercept of the regression line:

${\large \text{Intercept}=\text{Mean of Y}-\text{slope}\times\text{Mean of X}=\mu_{y}-m\mu_{x}}$

In [173]:
def intercept_single(df, x, y):
    """
    Intercept of the regression line that predicts column `y` from column `x`.

    Uses ``mean(y) - slope * mean(x)``, with the slope supplied by
    `slope_single`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    x : label
        Column used as the independent variable.
    y : label
        Column used as the dependent variable.

    Returns
    -------
    float
        The intercept ``c`` of the line ``y = m * x + c``.
    """
    mean_y = df[y].mean()
    gradient = slope_single(df, x, y)
    mean_x = df[x].mean()
    return mean_y - (gradient * mean_x)

In [144]:
# Intercept of the line that predicts "Miles per week" (y) from "Number of cars operating" (x).
intercept_single(df_train,"Number of cars operating","Miles per week")

-2740.794013023309

In [175]:
def intercept(df):
    """
    Regression-line intercepts for every pair of columns covered by `slope`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.

    Returns
    -------
    pandas.DataFrame
        Table with the same labels as ``slope(df)``. The *column* label is
        the x variable and the *row* label is the y variable, so
        ``result.loc[y, x]`` equals ``mean(y) - slope(y on x) * mean(x)``.
    """
    slopes = slope(df)
    means = df[slopes.columns].mean()
    # slope * mean(x) is applied column-wise; rsub then computes
    # mean(y) - (that product) row-wise. This replaces the cell-by-cell loop
    # that recomputed the correlation matrix for every entry.
    return slopes.mul(means, axis="columns").rsub(means, axis="index")

In [181]:
# Full intercept table: the column is the predictor (x), the row is the response (y).
intercept(df_train)

Unnamed: 0,Number of cars operating,Miles per week,Passengers per week
Number of cars operating,0.0,38.713361,36.074952
Miles per week,-2740.794013,0.0,288.592927
Passengers per week,-10493.983521,3173.251182,0.0


#### The prediction with the regression line:

In [196]:
def predict(df, x, z):
    """
    Predict every column from column `x` taking the value `z`.

    Each regression line with `x` as its input is evaluated as
    ``slope * z + intercept``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data the regression lines are fitted on.
    x : label
        Column used as the independent variable.
    z : number
        Value of `x` at which to evaluate the lines.

    Returns
    -------
    pandas.Series
        Named `x` and indexed by column name; each entry is the predicted
        value of that column.
    """
    slopes = slope(df)
    intercepts = intercept(df)
    fitted = slopes * z + intercepts
    return fitted[x]

In [201]:
# Predicted value of every column when "Number of cars operating" is 100.
predict(df_train,"Number of cars operating",100)

Number of cars operating      100.000000
Miles per week               6255.416117
Passengers per week         27073.713289
Name: Number of cars operating, dtype: float64