In [1]:
!pip install mlflow
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, LinearRegression, GBTRegressor
from pyspark.ml.feature import PCA, VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Read the training feature table from Parquet and preview it.
train_path = '/FileStore/my-stuff/df_train_final.parquet'
train_df = spark.read.parquet(train_path)
display(train_df)

In [3]:
# Columns treated as redundant for modelling; they are removed from the
# training table before the feature vector is assembled.
redundant_cols = [
    'created_utc', 'domain', 'subreddit', 'idx', 'index', 'timestamp',
    'subreddit_rev', 'DOW', 'DOW_value', 'whitelist_statusIndex',
    'whitelist_status_ohe', 'binned_score', 'DOW_Index', 'resultIndex',
    'lastIndex', 'protocolIndex', 'result1Index', 'subreddit_rev_Index',
    'subreddit_typeIndex', 'domaintfidf', 'total_author_score_ntile_ohe',
    'subreddit_author_count', 'total_subreddit_posts', 'titlewordCount',
    'avg_author_score_ntile_ohe', 'result_ohe', 'subreddit_rev_ohe',
    'last_ohe', 'protocol_ohe', 'created_hour',
]
train_df = train_df.drop(*redundant_cols)
display(train_df)

In [4]:
# Training feature frame: the training table without the 'score' column,
# which later cells use as the regression label.
X_train=train_df.drop('score')

In [5]:
# Pack every training feature column into a single 'features' vector and
# keep only that vector plus the 'score' label.
feature_cols = X_train.columns
vect_asmb = VectorAssembler(outputCol='features', inputCols=feature_cols)
assembled_train = vect_asmb.transform(train_df)
training_df = assembled_train.select('features', 'score')
display(training_df)

In [6]:
# Read the test feature table from Parquet and preview it.
test_path = '/FileStore/my-stuff/df_test_final.parquet'
test_df = spark.read.parquet(test_path)
display(test_df)

In [7]:
# Columns treated as redundant for modelling; they are removed from the
# test table before the feature vector is assembled.
redundant_cols = [
    'created_utc', 'domain', 'subreddit', 'idx', 'index', 'timestamp',
    'subreddit_rev', 'DOW', 'DOW_value', 'whitelist_statusIndex',
    'whitelist_status_ohe', 'binned_score', 'DOW_Index', 'resultIndex',
    'lastIndex', 'protocolIndex', 'result1Index', 'subreddit_rev_Index',
    'subreddit_typeIndex', 'domaintfidf', 'total_author_score_ntile_ohe',
    'subreddit_author_count', 'total_subreddit_posts', 'titlewordCount',
    'avg_author_score_ntile_ohe', 'result_ohe', 'subreddit_rev_ohe',
    'last_ohe', 'protocol_ohe', 'created_hour',
]
test_df = test_df.drop(*redundant_cols)
display(test_df)

In [8]:
# Test feature frame: the test table without the 'score' label column.
X_test=test_df.drop('score')

In [9]:
# Assemble the test feature vectors from the *training* column list so that
# every slot of the vector holds the same feature the model was fitted on.
# Taking the list from the test table instead would silently follow that
# file's own column order/contents and could misalign features.
train_feature_cols = X_train.columns

# Fail loudly, with a readable message, if the test table lacks a feature
# the model was trained on.
missing_cols = [c for c in train_feature_cols if c not in test_df.columns]
if missing_cols:
    raise ValueError(
        'Test table is missing training feature columns: {}'.format(missing_cols)
    )

vect_asmb_2 = VectorAssembler(inputCols=train_feature_cols, outputCol='features')
testing_df = vect_asmb_2.transform(test_df).select('features', 'score')
display(testing_df)

In [10]:
# Baseline linear regression with fixed regularisation settings, fitted on
# the assembled training vectors.
lr = LinearRegression(
    featuresCol='features',
    labelCol='score',
    maxIter=175,
    regParam=0.5,
    elasticNetParam=0.1,
)
model_2 = lr.fit(training_df)

# Predictions on the training data itself, used for the train-side metrics.
train_pred_df = model_2.transform(training_df)
display(train_pred_df)

In [11]:
# Score the test vectors with the baseline linear regression model and
# preview the resulting predictions.
prediction_df_2=model_2.transform(testing_df)
display(prediction_df_2)

In [12]:
# RMSE of the baseline linear regression on the test and training predictions.
eval_1 = RegressionEvaluator(
    metricName="rmse",
    labelCol="score",
    predictionCol="prediction",
)
rmse_2, train_rmse = (
    eval_1.evaluate(pred_df) for pred_df in (prediction_df_2, train_pred_df)
)
print('RMSE value of base linear regression model on test:', rmse_2)
print('RMSE value of base linear regression model on train:', train_rmse)

In [13]:
# R-squared of the baseline linear regression on the test and training predictions.
eval_2 = RegressionEvaluator(
    metricName="r2",
    labelCol="score",
    predictionCol="prediction",
)
r2_lr, train_r2_lr = (
    eval_2.evaluate(pred_df) for pred_df in (prediction_df_2, train_pred_df)
)
print('R-square value of linear regression model on test:', r2_lr)
print('R-square value of linear regression model on train:', train_r2_lr)

In [14]:
# Hyper-parameter search space for the linear regression:
# 5 x 6 x 5 = 150 candidate settings, each fitted once per fold.
paramGrid_2 = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [50, 75, 100, 125, 150])
    .addGrid(lr.regParam, [0.3, 0.5, 0.7, 0.8, 0.9, 1])
    .addGrid(lr.elasticNetParam, [0.05, 0.1, 0.2, 0.3, 0.5])
    .build()
)

# Candidates are ranked by RMSE; the metric is spelled out rather than left
# to the evaluator's default so the selection criterion is visible here.
ev = RegressionEvaluator(labelCol="score", predictionCol="prediction", metricName="rmse")

# A fixed seed pins the random fold assignment, so re-running the notebook
# evaluates every candidate on the same splits and selects the same model.
crossval_lr = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid_2,
    evaluator=ev,
    numFolds=5,
    parallelism=10,
    seed=42,
)

In [15]:
# Run the cross-validated grid search on the training vectors, then score
# both the training and the test vectors with the fitted result.
cv_2 = crossval_lr.fit(training_df)
train_cv2, prediction_cv2 = (
    cv_2.transform(vectors_df) for vectors_df in (training_df, testing_df)
)
display(prediction_cv2)

In [16]:
# Training-set RMSE and R-squared for the cross-validated linear regression.
rmse_cv2_train = eval_1.evaluate(train_cv2)
r2_cv2_train = eval_2.evaluate(train_cv2)
print('RMSE value of tuned linear regression model for train:', rmse_cv2_train)
print('R-square value of tuned linear regression model for train:', r2_cv2_train)

In [17]:
# Test-set RMSE and R-squared for the cross-validated linear regression.
rmse_cv2 = eval_1.evaluate(prediction_cv2)
r2_cv2 = eval_2.evaluate(prediction_cv2)
print('RMSE value of tuned linear regression model for test:', rmse_cv2)
print('R-square value of tuned linear regression model for test:', r2_cv2)

In [18]:
# Keep a handle on the best model exposed by the fitted cross-validator.
best_model2 = cv_2.bestModel

In [19]:
# Show the parameter map of the selected model; as the cell's last
# expression, the result is rendered as the cell output.
best_model2.extractParamMap()