In [36]:
import pandas as pd
import pipeline_utilities as p_util
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [74]:
# Load the three lesson datasets into separate DataFrames with read_csv.
#   df1 - news popularity
#   df2 - Beijing PM2.5 readings
#   df3 - garment worker productivity
# The shared base URL is kept in one place so the file names stand out.
DATASETS_URL = 'https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_3/datasets'
df1 = pd.read_csv(f'{DATASETS_URL}/news-popularity.csv')
df2 = pd.read_csv(f'{DATASETS_URL}/beijing-pm2-5.csv')
df3 = pd.read_csv(f'{DATASETS_URL}/garments-worker-productivity.csv')

# Preview the first rows of the second dataset
df2.head()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0


In [73]:
def preprocess_prod_data1(prod_df, random_state=None):
    """
    Written for the news popularity data (df1 in this notebook); will
    drop rows containing null values and split into training and
    testing sets. Uses SentimentTitle as the target column.

    The Title, Headline, Source, Topic, IDLink and PublishDate columns
    are removed from the features along with the target.

    random_state is passed straight to train_test_split. The default
    (None) keeps the original unseeded behavior; pass an integer to
    get the same split on every run.

    Returns the list [X_train, X_test, y_train, y_test] produced by
    train_test_split, where the y arrays are column vectors of
    shape (n, 1).
    """
    # Null rows are removed before any column is dropped, so a null in a
    # column that is later discarded (e.g. Source) still removes the row.
    df = prod_df.dropna()
    X = df.drop(columns=['SentimentTitle','Title','Headline','Source','Topic','IDLink','PublishDate'])

    # Reshape the target to a single column: (n,) -> (n, 1)
    y = df['SentimentTitle'].values.reshape(-1, 1)
    return train_test_split(X, y, random_state=random_state)

def preprocess_prod_data2(prod_df, random_state=None):
    """
    Written for the Beijing PM2.5 data (df2 in this notebook); will
    drop rows containing null values and split into training and
    testing sets. Uses TEMP as the target column.

    The cbwd and pm2.5 columns are removed from the features along
    with the target.

    random_state is passed straight to train_test_split. The default
    (None) keeps the original unseeded behavior; pass an integer to
    get the same split on every run.

    Returns the list [X_train, X_test, y_train, y_test] produced by
    train_test_split, where the y arrays are column vectors of
    shape (n, 1).
    """
    # Null rows are removed before any column is dropped, so rows with a
    # missing pm2.5 reading are excluded even though pm2.5 is not a feature.
    df = prod_df.dropna()
    X = df.drop(columns=['TEMP','cbwd','pm2.5'])

    # Reshape the target to a single column: (n,) -> (n, 1)
    y = df['TEMP'].values.reshape(-1, 1)
    return train_test_split(X, y, random_state=random_state)

def preprocess_prod_data3(prod_df, random_state=None):
    """
    Written for the garment worker productivity data (df3 in this
    notebook); will drop rows containing null values and split into
    training and testing sets. Uses actual_productivity as the
    target column.

    The department, quarter, date and day columns are removed from
    the features along with the target.

    random_state is passed straight to train_test_split. The default
    (None) keeps the original unseeded behavior; pass an integer to
    get the same split on every run.

    Returns the list [X_train, X_test, y_train, y_test] produced by
    train_test_split, where the y arrays are column vectors of
    shape (n, 1).
    """
    # Null rows are removed before any column is dropped
    df = prod_df.dropna()
    X = df.drop(columns=['actual_productivity','department','quarter','date','day'])

    # Reshape the target to a single column: (n,) -> (n, 1)
    y = df['actual_productivity'].values.reshape(-1, 1)
    return train_test_split(X, y, random_state=random_state)



In [63]:
def r2_adj1(x, y, model):
    """
    Calculates the adjusted r-squared value given the features x,
    the actual target values y for those rows, and the fitted model
    used for the predictions.

    model.score(x, y) is expected to return r-squared. The result is
    1 - (1 - r2) * (n - 1) / (n - p - 1), where n is the number of
    rows in x and p is the number of columns (predictors) in x.
    """
    r2 = model.score(x, y)
    n = x.shape[0]
    # p is the number of predictors, so it must come from x. Reading it
    # from y gave the number of target columns instead (and failed on 1-D y).
    p = x.shape[1]
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def r2_adj2(x, y, model):
    """
    Calculates the adjusted r-squared value given the features x,
    the actual target values y for those rows, and the fitted model
    used for the predictions.

    model.score(x, y) is expected to return r-squared. The result is
    1 - (1 - r2) * (n - 1) / (n - p - 1), where n is the number of
    rows in x and p is the number of columns (predictors) in x.
    """
    r2 = model.score(x, y)
    n = x.shape[0]
    # p is the number of predictors, so it must come from x. Reading it
    # from y gave the number of target columns instead (and failed on 1-D y).
    p = x.shape[1]
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def r2_adj3(x, y, model):
    """
    Calculates the adjusted r-squared value given the features x,
    the actual target values y for those rows, and the fitted model
    used for the predictions.

    model.score(x, y) is expected to return r-squared. The result is
    1 - (1 - r2) * (n - 1) / (n - p - 1), where n is the number of
    rows in x and p is the number of columns (predictors) in x.
    """
    r2 = model.score(x, y)
    n = x.shape[0]
    # p is the number of predictors, so it must come from x. Reading it
    # from y gave the number of target columns instead (and failed on 1-D y).
    p = x.shape[1]
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)


In [65]:
def prod_model_generator1(prod_df):
    """
    Trains and evaluates a scaled linear regression on prod_df.

    The data is cleaned and split by preprocess_prod_data1, a
    pipeline of StandardScaler (without mean centering) followed by
    LinearRegression is fitted on the training split, and the mean
    squared error, r-squared, and adjusted r-squared for the testing
    split are printed. A warning is printed when the adjusted
    r-squared is below 0.4. Returns the fitted pipeline.
    """
    # Clean the data and obtain the train/test split
    X_train, X_test, y_train, y_test = preprocess_prod_data1(prod_df)

    # Two named stages: scale the features, then fit the regression
    model = Pipeline([
        ("Scale", StandardScaler(with_mean=False)),
        ("Linear Regression", LinearRegression()),
    ])
    model.fit(X_train, y_train)

    # Score the fitted pipeline on the held-out split
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2_value = r2_score(y_test, predictions)
    r2_adj_value = r2_adj1(X_test, y_test, model)

    # Report the metrics
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2_value}")
    print(f"Adjusted R-squared: {r2_adj_value}")
    if r2_adj_value < 0.4:
        print("WARNING: LOW ADJUSTED R-SQUARED VALUE")

    return model

def prod_model_generator2(prod_df):
    """
    Trains and evaluates a scaled linear regression on prod_df.

    The data is cleaned and split by preprocess_prod_data2, a
    pipeline of StandardScaler (without mean centering) followed by
    LinearRegression is fitted on the training split, and the mean
    squared error, r-squared, and adjusted r-squared for the testing
    split are printed. A warning is printed when the adjusted
    r-squared is below 0.4. Returns the fitted pipeline.
    """
    # Clean the data and obtain the train/test split
    X_train, X_test, y_train, y_test = preprocess_prod_data2(prod_df)

    # Two named stages: scale the features, then fit the regression
    model = Pipeline([
        ("Scale", StandardScaler(with_mean=False)),
        ("Linear Regression", LinearRegression()),
    ])
    model.fit(X_train, y_train)

    # Score the fitted pipeline on the held-out split
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2_value = r2_score(y_test, predictions)
    r2_adj_value = r2_adj2(X_test, y_test, model)

    # Report the metrics
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2_value}")
    print(f"Adjusted R-squared: {r2_adj_value}")
    if r2_adj_value < 0.4:
        print("WARNING: LOW ADJUSTED R-SQUARED VALUE")

    return model

def prod_model_generator3(prod_df):
    """
    Trains and evaluates a scaled linear regression on prod_df.

    The data is cleaned and split by preprocess_prod_data3, a
    pipeline of StandardScaler (without mean centering) followed by
    LinearRegression is fitted on the training split, and the mean
    squared error, r-squared, and adjusted r-squared for the testing
    split are printed. A warning is printed when the adjusted
    r-squared is below 0.4. Returns the fitted pipeline.
    """
    # Clean the data and obtain the train/test split
    X_train, X_test, y_train, y_test = preprocess_prod_data3(prod_df)

    # Two named stages: scale the features, then fit the regression
    model = Pipeline([
        ("Scale", StandardScaler(with_mean=False)),
        ("Linear Regression", LinearRegression()),
    ])
    model.fit(X_train, y_train)

    # Score the fitted pipeline on the held-out split
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2_value = r2_score(y_test, predictions)
    r2_adj_value = r2_adj3(X_test, y_test, model)

    # Report the metrics
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2_value}")
    print(f"Adjusted R-squared: {r2_adj_value}")
    if r2_adj_value < 0.4:
        print("WARNING: LOW ADJUSTED R-SQUARED VALUE")

    return model

In [78]:
# Create a dictionary to hold the trained model for the first dataset (df1)
model_dict1 = {}
# Print the DataFrame for reference before training
print(df1)
# Train the pipeline (prod_model_generator1 prints its own metrics) and store
# it under a key built from the value in the first row, fourth column of df1.
# NOTE(review): that value is an arbitrary data cell, not a dataset name -
# confirm this is the intended key.
model_dict1[f"{df1.iloc[0,3]}_model"] = prod_model_generator1(df1)
# Separator line printed after the metrics
print("----------")




         IDLink                                              Title  \
0       99248.0   Obama Lays Wreath at Arlington National Cemetery   
1       10423.0        A Look at the Health of the Chinese Economy   
2       18828.0   Nouriel Roubini: Global Economy Not Back to 2008   
3       27788.0                          Finland GDP Expands In Q4   
4       27789.0  Tourism, govt spending buoys Thai economy in J...   
...         ...                                                ...   
93234   61851.0  Stocks rise as investors key in on US economy ...   
93235   61865.0  Russian PM proposes to use conservative and to...   
93236  104793.0  Palestinian Government Uses Foreign Aid To Pay...   
93237  104794.0  Palestine Youth Orchestra prepares for first U...   
93238   61870.0  Sausalito businesswoman wins $10000 in Microso...   

                                                Headline  \
0      Obama Lays Wreath at Arlington National Cemete...   
1      Tim Haywood, investment director

In [76]:
# Create a dictionary to hold the trained model for the second dataset (df2)
model_dict2 = {}
# Print the DataFrame for reference before training
print(df2)
# Train the pipeline (prod_model_generator2 prints its own metrics) and store
# it under a key built from the value in the first row, fourth column of df2.
# NOTE(review): in the printed frame that column appears to be `day`, so the
# key would be "1_model" - confirm this is the intended key.
model_dict2[f"{df2.iloc[0,3]}_model"] = prod_model_generator2(df2)
# Separator line printed after the metrics
print("----------")


          No  year  month  day  hour  pm2.5  DEWP  TEMP    PRES cbwd     Iws  \
0          1  2010      1    1     0    NaN   -21 -11.0  1021.0   NW    1.79   
1          2  2010      1    1     1    NaN   -21 -12.0  1020.0   NW    4.92   
2          3  2010      1    1     2    NaN   -21 -11.0  1019.0   NW    6.71   
3          4  2010      1    1     3    NaN   -21 -14.0  1019.0   NW    9.84   
4          5  2010      1    1     4    NaN   -20 -12.0  1018.0   NW   12.97   
...      ...   ...    ...  ...   ...    ...   ...   ...     ...  ...     ...   
43819  43820  2014     12   31    19    8.0   -23  -2.0  1034.0   NW  231.97   
43820  43821  2014     12   31    20   10.0   -22  -3.0  1034.0   NW  237.78   
43821  43822  2014     12   31    21   10.0   -22  -3.0  1034.0   NW  242.70   
43822  43823  2014     12   31    22    8.0   -22  -4.0  1034.0   NW  246.72   
43823  43824  2014     12   31    23   12.0   -21  -3.0  1034.0   NW  249.85   

       Is  Ir  
0       0   0  
1      

In [77]:
# Create a dictionary to hold the trained model for the third dataset (df3)
model_dict3 = {}
# Print the DataFrame for reference before training
print(df3)
# Train the pipeline (prod_model_generator3 prints its own metrics) and store
# it under a key built from the value in the first row, fourth column of df3.
# NOTE(review): in the printed frame that column appears to be `day`, so the
# key would be "Thursday_model" - confirm this is the intended key.
model_dict3[f"{df3.iloc[0,3]}_model"] = prod_model_generator3(df3)
# Separator line printed after the metrics
print("----------")

           date   quarter  department        day  team  targeted_productivity  \
0      1/1/2015  Quarter1      sweing   Thursday     8                   0.80   
1      1/1/2015  Quarter1  finishing    Thursday     1                   0.75   
2      1/1/2015  Quarter1      sweing   Thursday    11                   0.80   
3      1/1/2015  Quarter1      sweing   Thursday    12                   0.80   
4      1/1/2015  Quarter1      sweing   Thursday     6                   0.80   
...         ...       ...         ...        ...   ...                    ...   
1192  3/11/2015  Quarter2   finishing  Wednesday    10                   0.75   
1193  3/11/2015  Quarter2   finishing  Wednesday     8                   0.70   
1194  3/11/2015  Quarter2   finishing  Wednesday     7                   0.65   
1195  3/11/2015  Quarter2   finishing  Wednesday     9                   0.75   
1196  3/11/2015  Quarter2   finishing  Wednesday     6                   0.70   

        smv     wip  over_t