## Linear Regression using Python (basics) 

In [21]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

### Read file
# Keep the dataset location in one named place so it is easy to swap out.
DATA_URL = 'https://gist.githubusercontent.com/omarish/5687264/raw/7e5c814ce6ef33e25d5259c1fe79463c190800d9/mpg.csv'
df = pd.read_csv(DATA_URL)

### Check the data type pandas inferred for each column
df.dtypes

# Data cleaning step
# Show the rows whose 'horsepower' entry cannot be parsed as a number, so it
# is visible which records are about to be discarded.
print(df[pd.to_numeric(df['horsepower'],errors='coerce').isnull()])

# Drop the columns that are not used by the regression. Doing this first keeps
# the free-text 'name' column out of the numeric coercion below.
df=df.drop(['name','origin','model_year'],axis=1)

# Coerce every remaining column to a numeric dtype in a single pass; any value
# that cannot be parsed (such as the '?' placeholders printed above) becomes NaN.
df=df.apply(pd.to_numeric,errors='coerce')

# Discard the rows that contain NaN after the coercion.
df=df.dropna()

# Drop any rows with NaN values *before* building x and y; dropping them from
# df after the split (as was done previously) cannot affect x or y.
df=df.dropna()

# Separating the dependent variable (y) and the independent variables (x)
x=df.drop('mpg',axis=1)
y=df[['mpg']]

# Making the training and testing datasets: 20% of the rows are held out for
# testing, and the fixed random_state keeps the split reproducible.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)

# Linear regression: find the best-fit line using only one column
single_feature=['horsepower']
reg=LinearRegression().fit(x_train[single_feature],y_train)

# Predict the target for the held-out rows
y_predicted=reg.predict(x_test[single_feature])

# Evaluation metrics on the test split
mse=mean_squared_error(y_test,y_predicted)
print("Mean Squared error: %.2f" % mse)
r2=r2_score(y_test,y_predicted)
print('R**2: %.2f'%r2)
# Find the best fit using more than one column; the same estimator object is
# refitted, which replaces the single-column fit.
multi_features=['horsepower','weight','cylinders']
reg.fit(x_train[multi_features],y_train)
y_predicted=reg.predict(x_test[multi_features])

# Evaluation metrics for the multi-column model on the test split
mse=mean_squared_error(y_test,y_predicted)
print("Mean squared error: %.2f" % mse)
r2=r2_score(y_test,y_predicted)
print('R**2,%.2f' % r2)

      mpg  cylinders  displacement horsepower  weight  acceleration  \
32   25.0          4          98.0          ?    2046          19.0   
126  21.0          6         200.0          ?    2875          17.0   
330  40.9          4          85.0          ?    1835          17.3   
336  23.6          4         140.0          ?    2905          14.3   
354  34.5          4         100.0          ?    2320          15.8   
374  23.0          4         151.0          ?    3035          20.5   

     model_year  origin                  name  
32           71       1            ford pinto  
126          74       1         ford maverick  
330          80       2  renault lecar deluxe  
336          80       1    ford mustang cobra  
354          81       2           renault 18i  
374          82       1        amc concord dl  
Mean Squared error: 28.66
R**2: 0.59
Mean squared error: 19.12
R**2,0.72
