In [None]:
# Load numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# Load statsmodels as alias 'sm'
import statsmodels.api as sm

In [None]:
# Read the Longley CSV into a pandas DataFrame; index_col=0 turns the first CSV column into the row labels
longley_url = 'http://vincentarelbundock.github.io/Rdatasets/csv/datasets/longley.csv'
df = pd.read_csv(longley_url, index_col=0)
df.head(3)

We will use the variable Total Derived Employment ('Employed') as our response y and Gross National Product ('GNP') as our predictor X.

In [None]:
# Predictor variable: the GNP column, selected by label
X = df['GNP']
X.head(3)

In [None]:
# Response variable: the Employed column, selected by label
y = df['Employed']

In [None]:
# Add a constant term to the predictor so that the model can estimate an intercept.
# Note: X is rebound here from the bare GNP column to the augmented table; later
# cells read the predictor back out of it as X.GNP.
X = sm.add_constant(X)

In [None]:
# Set up the OLS model: y is the response (endog) and X the predictors (exog).
# Nothing is estimated yet; the fit happens in the next cell.
est = sm.OLS(endog=y, exog=X)

In [None]:
# Fit the model.
# est is rebound from the model object to whatever fit() returns; the following
# cells call summary(), params and predict() on that result.
# NOTE(review): since est no longer refers to the model afterwards, re-running this
# cell on its own presumably fails; re-run the previous cell first.
est = est.fit()

In [None]:
# Show the summary report of the fitted model
est.summary()

In [None]:
# Coefficients of the fit: the estimated parameters for the columns of X
# (the added constant and GNP)
est.params

In [None]:
# Make sure that graphics appear inline in the iPython notebook
%pylab inline

# We pick 100 hundred points equally spaced from the min to the max
X_prime = np.linspace(X.GNP.min(), X.GNP.max(), 100)[:, np.newaxis]
X_prime = sm.add_constant(X_prime)  # add constant as we did before

# Now we calculate the predicted values
y_hat = est.predict(X_prime)

plt.scatter(X.GNP, y, alpha=0.3)  # Plot the raw data
plt.xlabel("Gross National Product")
plt.ylabel("Total Employment")
plt.plot(X_prime[:, 1], y_hat, 'r', alpha=0.9)  # Add the regression line, colored in red

In [None]:
# Bring in the formula interface under the alias smf
import statsmodels.formula.api as smf

# Build the no-intercept model from the formula string (the '- 1' term drops the
# constant), then fit it as a separate step
model_no_int = smf.ols(formula='Employed ~ GNP - 1', data=df)
est_no_int = model_no_int.fit()

In [None]:
# Show the summary report of the no-intercept fit
est_no_int.summary()

**Linear regression with scikit-learn**

In [73]:
import numpy as np
import pandas as pd
from sklearn import linear_model

In [74]:
# Reload the Longley dataset for the scikit-learn section. This rebinds df, and
# unlike the earlier load no index_col is passed, so the first CSV column is kept
# as an ordinary column instead of becoming the row labels.
df = pd.read_csv('http://vincentarelbundock.github.io/Rdatasets/csv/datasets/longley.csv')

In [97]:
# Predictor variable as a 2-D column array of shape (n_samples, 1), which is the
# layout lm.fit() is given below.
# Series.reshape no longer exists in current pandas, so the reshape is applied
# to the underlying NumPy array instead of to the Series.
X = df['GNP'].values.reshape(-1, 1)

In [98]:
# Response variable, kept as an (n_samples, 1) column so that the fitted
# intercept_ stays array-valued, as in the output echoed further down.
# Series.reshape no longer exists in current pandas, so the reshape is applied
# to the underlying NumPy array instead of to the Series.
y = df['Employed'].values.reshape(-1, 1)

In [99]:
# Instantiate a linear regression object with its default settings
# (no constructor arguments are passed)
lm = linear_model.LinearRegression()

In [100]:
# Fit the model on the full dataset: X and y are the complete GNP and Employed
# columns, and no train/test split is made in this section
lm.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [109]:
# Print the estimated intercept. The print() call with a pre-formatted string
# gives the same text on Python 2 and Python 3 (the bare print statement is a
# syntax error on Python 3).
print('Estimated intercept coefficient: %s' % (lm.intercept_,))

Estimated intercept coefficient: [ 51.84358978]


In [108]:
# Print how many coefficients were fitted; print() with a pre-formatted string
# gives the same text on Python 2 and Python 3
print('Number of coefficients: %d' % len(lm.coef_))

Number of coefficients: 1


In [None]:
# Mean squared error of the fitted model, evaluated on the same X and y it was
# fitted on (this notebook has no held-out test set). The value is a mean of
# squared residuals, so it is labelled as such rather than as a sum.
print("Mean squared error: %.2f"
      % np.mean((lm.predict(X) - y) ** 2))

In [None]:
# Score of the fitted model on the data it was fitted on: 1 is perfect prediction
print('Variance score: %.2f' % lm.score(X, y))

In [None]:
# Plot the observed data (black points) and the line predicted by the fitted model (blue)
plt.scatter(X, y, color='black')
plt.plot(X, lm.predict(X), color='blue',
         linewidth=3)

# Empty tuples remove the tick marks from both axes
plt.xticks(())
plt.yticks(())

plt.show()

In [None]:
def basic_linear_regression(X, y):
    """Fit the least-squares line ``y = a * x + b`` to paired samples.

    Parameters
    ----------
    X : sequence of numbers
        Predictor values.
    y : sequence of numbers
        Response values, one per element of ``X``.

    Returns
    -------
    tuple
        ``(a, b)`` where ``a`` is the slope and ``b`` is the intercept.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or all ``X`` values are
        identical (the slope is undefined in that case).
    """
    length = len(X)
    if length == 0 or length != len(y):
        raise ValueError("X and y must be non-empty and of equal length")

    # Work with a float count so the divisions below are never floor divisions
    n = float(length)
    sum_x = sum(X)
    sum_y = sum(y)

    # Σx^2, and Σxy respectively. Iterating over the values (instead of indexing
    # with range(length)) also works for inputs whose index is not 0..n-1.
    sum_x_squared = sum(x_i * x_i for x_i in X)
    sum_of_products = sum(x_i * y_i for x_i, y_i in zip(X, y))

    # Closed-form least-squares solution:
    #   a = (Σxy - Σx·Σy / n) / (Σx^2 - (Σx)^2 / n)
    #   b = (Σy - a·Σx) / n
    denominator = sum_x_squared - (sum_x ** 2) / n
    if denominator == 0:
        raise ValueError("slope is undefined when all X values are identical")
    a = (sum_of_products - (sum_x * sum_y) / n) / denominator
    b = (sum_y - a * sum_x) / n
    return a, b

In [None]:
# Small practice list, standardized below first by hand and then with StandardScaler
prac_list = [10, 15, 20]

In [None]:
# Mean and standard deviation of the practice values (np.std is called with its defaults)
prac_avg, prac_std = np.mean(prac_list), np.std(prac_list)

In [None]:
# Standardize every value by hand: subtract the mean, then divide by the standard deviation
new_list = [(value - prac_avg) / prac_std for value in prac_list]

In [None]:
# Display the manually standardized values
new_list

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
# StandardScaler works on 2-D input of shape (n_samples, n_features) and rejects
# a flat list, so the values are passed as a single-feature column and the result
# is flattened again to line up element-for-element with new_list.
prac_column = np.array(prac_list, dtype=float).reshape(-1, 1)
features_norm_2 = scaler.fit_transform(prac_column).ravel()

In [None]:
# Display the StandardScaler output for comparison with the hand-computed new_list
features_norm_2