# For Data Checking


For Reading Data

In [None]:
# Default: the first row of the file is used as the column names.
df = pd.read_csv(filepath)
# Use header=None if no headers are there.
df = pd.read_csv(other_path, header=None)
# Use names=... if you want to set your own headers while importing the data.
df = pd.read_csv('auto.csv', names=headers)

For Setting Values to Pandas Columns or Checking Them

In [None]:
# `headers` is the variable containing the list of column names.
df.columns = headers
# Show the current column labels.
print(df.columns)
# Map each old label to its new label. Every key must be a distinct column:
# a repeated key in a dict literal silently overwrites the earlier entry.
df.rename(columns={'old name 1': 'new name 1', 'old name 2': 'new name 2'}, inplace=True)

To Join a Column or Columns to a Dataframe

In [None]:
# axis=1 places `the_column` next to the existing columns of df.
frames_to_join = [df, the_column]
df = pd.concat(frames_to_join, axis=1)

To Save Datasets

In [1]:
# `path` has to be a path on the computer.
df.to_csv(path)
# index=False avoids saving the row indexes to the csv file.
df.to_csv(path, index=False)
# A bare file name saves the dataset to the Jupyter home page (working directory).
df.to_csv('clean_df.csv')

To Check Dataset Qualities

In [None]:
# Data type of every column.
df.dtypes
# Summary statistics with the default column selection.
df.describe()
# include="all" also covers the string columns (their numeric statistics show as NaN).
df.describe(include="all")
# Statistics for certain columns only.
df[['column 1', 'column 2', 'column 3']].describe()
# Column data types plus the number of non-null values in each column.
df.info()

# For Data Wrangling


To Replace Missing Values from ? to NaN

In [None]:
import numpy as np  # NumPy has to be imported first: np.nan is the replacement value

# Turn every "?" placeholder into NaN.
df.replace("?", np.nan, inplace=True)

To Check Missing Data in Individual Columns

In [None]:
# Boolean frame of the same shape as df:
# True indicates a missing value and False indicates a non-missing value.
missing_data = df.isnull()
for column in missing_data.columns.values.tolist():
    print(column)
    print(missing_data[column].value_counts())
    print("")

For Replacing Missing Values With Average

In [None]:
# astype("float") can be removed if the data type doesn't have to be converted to float.
the_average = df["column"].astype("float").mean(axis=0)
# Assign the result back: an in-place replace on df["column"] acts on an
# intermediate Series and is not guaranteed to update df itself.
df["column"] = df["column"].replace(np.nan, the_average)

To Replace with Highest Frequency

In [None]:
# value_counts() shows the unique values with their frequencies; run it on its
# own to pick the most frequent value, then use that value to replace the nulls.
df['column'].value_counts()
# Assign the result back: an in-place replace on df["column"] acts on an
# intermediate Series and is not guaranteed to update df itself.
df["column"] = df["column"].replace(np.nan, "mostfrequentvalue")

To Drop Entire Rows and Reset Index

In [None]:
# Drop every row whose "column" value is missing.
df.dropna(subset=["column"], axis=0, inplace=True)
# Dropping rows leaves gaps in the index, so renumber it from 0;
# drop=True discards the old index instead of keeping it as a column.
df.reset_index(drop=True, inplace=True)

Changing Data Types for Columns

In [None]:
# For a single column.
df[["column"]] = df[["column"]].astype("float")

# For multiple columns.
df[["column1", "column2"]] = df[["column1", "column2"]].astype("int")

Adding Column with Changes to Another Existing Column

In [None]:
# New column computed from an existing one: 235 divided by city-mpg.
MPG_CONVERSION_NUMERATOR = 235
df['city-L/100km'] = MPG_CONVERSION_NUMERATOR / df["city-mpg"]

Data Normalization

In [None]:
# Simple feature scaling: divide every value by the column maximum.
df['column'] = df['column'] / df['column'].max()

Data Binning

In [None]:
# Four equally spaced edges between the column minimum and maximum give three bins.
# Series.min()/max() skip NaN; the builtin min()/max() are order-dependent with NaN.
bins = np.linspace(df["horsepower"].min(), df["horsepower"].max(), 4)
group_names = ['Low', 'Medium', 'High']
# include_lowest=True keeps the minimum value inside the first bin.
df['horsepower-binned'] = pd.cut(df['horsepower'], bins, labels=group_names, include_lowest=True)

# Optional: number of rows in each bin.
df["horsepower-binned"].value_counts()

Get Dummies for a Particular Column

In [None]:
# One indicator column per distinct value of "fuel-type".
# The resulting columns can then be joined to df with the pd.concat(..., axis=1) snippet.
dummy_variable_1 = pd.get_dummies(df["fuel-type"])

# For Exploratory Data Analysis

Getting Correlation for all Features in the DataFrame

In [None]:
# numeric_only=True limits the correlation to numeric columns; without it
# pandas >= 2.0 raises on string columns (the argument needs pandas >= 1.5).
df.corr(numeric_only=True)

# Correlation for certain features only.
df[['bore','stroke','compression-ratio','horsepower']].corr()

To plot Regression for X(feature) and Y(target) variables

In [None]:
sns.regplot(x="engine-size", y="price", data=df)
# Starting the y axis at 0 makes the plot look better when the data range starts from 0.
plt.ylim(bottom=0)

To plot Boxplots for Categorical Variables and Target Variable

In [None]:
# Distribution of the target ("price") for each category of "body-style".
sns.boxplot(data=df, x="body-style", y="price")

To get Statistics for Only Categorical Variables

In [None]:
# Restrict the summary to object-dtype columns.
categorical_dtypes = ['object']
df.describe(include=categorical_dtypes)

To get Unique Values for Categorical Columns

In [None]:
# This selects a single column with single brackets, i.e. a pandas Series,
# counts its values, then to_frame() turns the counts into a DataFrame for display.
df['drive-wheels'].value_counts().to_frame()

To Create Pivot Tables

In [2]:
df_gptest = df[['drive-wheels','body-style','price']]
# Average of the remaining column for each (drive-wheels, body-style) pair.
grouped_test1 = df_gptest.groupby(['drive-wheels','body-style'], as_index=False).mean()

# Convert the table to a pivot: `index` is for rows and `columns` is for columns.
grouped_pivot = grouped_test1.pivot(index='drive-wheels', columns='body-style')

# Fill NaN values with 0.
grouped_pivot = grouped_pivot.fillna(0)

# Last expression in the cell, so the finished pivot table is displayed.
grouped_pivot

To Calculate P-Value Between X and Y Variables

In [None]:
from scipy import stats

# pearsonr yields the correlation coefficient and its p-value, in that order.
pearson_result = stats.pearsonr(df['wheel-base'], df['price'])
pearson_coef, p_value = pearson_result
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)

# For Model Development

Importing Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

Creating a Linear Regression Object and a Model

In [None]:
# Simple linear regression: one feature column.
lm = LinearRegression()
X = df[['highway-mpg']]
Y = df['price']
lm.fit(X, Y)

# Multiple linear regression: several feature columns.
# A separate estimator is used so that `lm` stays fitted on X; refitting `lm`
# on Z would make the later lm.predict(X) calls fail on the feature mismatch.
Z = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
lm_multi = LinearRegression()
lm_multi.fit(Z, Y)

Predicting Values on a Model

In [None]:
# Predictions for every row of X, then a preview of the first five.
Yhat = lm.predict(X)
Yhat[:5]

Getting the Intercept, Slope, and Score

In [None]:
# Only the last expression of a cell is echoed; run these one per cell
# (or wrap them in print) to see all three.
lm.intercept_
lm.coef_
# X and Y are the feature frame and target defined when the model was fitted.
lm.score(X, Y)

To Visualize a Multiple Linear Regression Model

In [None]:
import seaborn as sns
%matplotlib inline 


plt.figure(figsize=(width, height))


ax1 = sns.distplot(df['price'], hist=False, color="r", label="Actual Value")
sns.distplot(Y_hat, hist=False, color="b", label="Fitted Values" , ax=ax1)


plt.title('Actual vs Fitted Values for Price')
plt.xlabel('Price (in dollars)')
plt.ylabel('Proportion of Cars')

plt.show()
plt.close()


'''to plot distribution plot for MLR'''

To Create a Polynomial Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Steps run in order: scale the features, expand them to polynomial terms,
# then fit a linear regression on the expanded features.
Input = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(include_bias=False)),
    ('model', LinearRegression()),
]
pipe = Pipeline(Input)

# Z is the multi-feature frame and Y the target defined in the model-fitting cell.
pipe.fit(Z, Y)

# Predictions from the fitted pipeline, then a preview of the first five.
ypipe = pipe.predict(Z)
ypipe[0:5]

To Visualize a Polynomial Regression Model

In [None]:
def PlotPolly(model, independent_variable, dependent_variabble, Name):
    """Plot the observed points together with the fitted polynomial curve.

    Args:
        model: The polynomial model variable. It is called as ``model(x)`` on
            a NumPy array (presumably an ``np.poly1d`` - confirm with the caller).
        independent_variable: x values of the observed points.
        dependent_variabble: y values of the observed points.
        Name: Label for the x axis.
    """
    # The curve is evaluated at 100 evenly spaced x values from 15 to 55.
    x_new = np.linspace(15, 55, 100)
    y_new = model(x_new)

    # Observed data as dots, fitted curve as a solid line.
    plt.plot(independent_variable, dependent_variabble, '.', x_new, y_new, '-')
    plt.title('Polynomial Fit with Matplotlib for Price ~ Length')
    ax = plt.gca()
    ax.set_facecolor((0.898, 0.898, 0.898))
    plt.xlabel(Name)
    plt.ylabel('Price of Cars')

    plt.show()
    plt.close()

To Get Mean Squared Error for Linear Regression Models

In [None]:
from sklearn.metrics import mean_squared_error

# Compare the model's predictions on X with the actual prices.
Yhat = lm.predict(X)
mse = mean_squared_error(y_true=df['price'], y_pred=Yhat)
print('The mean square error of price and predicted value is: ', mse)

# For Model Refinement

For Splitting Into Train and Test Data

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; random_state=1 makes the split repeatable.
# x_data / y_data are the feature frame and target, defined outside this cell.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.10, random_state=1)

# The model is saved to the lre object and then used on the test data.
lre = LinearRegression()
lre.fit(x_train[['horsepower']], y_train)
# Last expression in the cell, so the test-set score is displayed.
lre.score(x_test[['horsepower']], y_test)

## To understand the polynomial regression code below, note that the train and test features are first transformed into polynomial features. The LinearRegression object is then fitted on the transformed features to create the model.
             

Create Polynomial Regression Model Using xTrain and xTest Data

In [None]:
from sklearn.preprocessing import PolynomialFeatures

pr = PolynomialFeatures(degree=5)
# Fit the transformer on the training features only, then reuse it on the test features.
x_train_pr = pr.fit_transform(x_train[['horsepower']])
x_test_pr = pr.transform(x_test[['horsepower']])

poly = LinearRegression()
poly.fit(x_train_pr, y_train)

# Tip: a great idea is to first look at the plot of the best polynomial order
# before creating the model.
# Last expression in the cell, so the training score is displayed.
poly.score(x_train_pr, y_train)

To See Plot of Best Polynomial Order with R^2

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Test-set R^2 for each polynomial order, in the same order as `order`.
Rsqu_test = []
# Estimator that is refitted for every order in the loop.
lr = LinearRegression()

order = [1, 2, 3, 4]
for n in order:
    pr = PolynomialFeatures(degree=n)

    # Fit the transformer on the training features only, then reuse it on the test features.
    x_train_pr = pr.fit_transform(x_train[['horsepower']])
    x_test_pr = pr.transform(x_test[['horsepower']])

    lr.fit(x_train_pr, y_train)

    Rsqu_test.append(lr.score(x_test_pr, y_test))

plt.plot(order, Rsqu_test)
plt.xlabel('order')
plt.ylabel('R^2')
plt.title('R^2 Using Test Data')
# Fixed-position annotation at (3, 0.75); move it if the curve peaks elsewhere.
plt.text(3, 0.75, 'Maximum R^2 ')

To Calculate Cross Validation Score

In [None]:
from sklearn.model_selection import cross_val_score

# One score per fold (cv=4), then the average and spread across the folds.
Rcross = cross_val_score(estimator=lre, X=x_data[['horsepower']], y=y_data, cv=4)
fold_mean = Rcross.mean()
fold_std = Rcross.std()
print("The mean of the folds are", fold_mean, "and the standard deviation is" , fold_std)

To Run a Grid Search and Get a Model

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Candidate alpha values, each listed once (a repeated value only adds a redundant fit).
parameters1 = [{'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}]

RR = Ridge()

# Try every alpha with 4-fold cross-validation.
Grid1 = GridSearchCV(RR, parameters1, cv=4)
# NOTE(review): the search is fitted on x_data, which presumably still contains
# the x_test rows, so the score below may be optimistic - consider fitting on x_train.
Grid1.fit(x_data[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']], y_data)

# The estimator with the best alpha, scored on the test data.
BestRR = Grid1.best_estimator_
BestRR.score(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']], y_test)