In [1]:
from pyspark.sql import SparkSession    # main entry point for DataFrame and SQL functionality
from pyspark.sql.functions import col    # for returning a column based on a given column name
from pyspark.sql.functions import lit    # for adding a new column to PySpark DataFrame
from pyspark.ml.classification import LogisticRegression    # for classification model
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler    # for preparing data for classification
from pyspark.ml.evaluation import MulticlassClassificationEvaluator    # for evaluating classification models
from pyspark.ml import Pipeline
import pandas as pd   # for data frames
import numpy as np    # for arrays
import time           # for timing cells  
import matplotlib.pyplot as plt # plotting graphs

In [2]:
# Decision Tree
from pyspark.ml.classification import DecisionTreeClassifier

spark = SparkSession.builder.appName('Decision_Tree').getOrCreate()
spark

In [3]:
# Use file created in the earlier HW to model Decision Tree.

ratings_df = spark.read.csv('ratings.csv', header=True, inferSchema=True)   
ratings_df.count()

6000

In [4]:
ratings_columns = ratings_df.columns
pd.DataFrame(ratings_df.take(6000), columns=ratings_columns).groupby('ground_truth').count()
ratings_df.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- trackID: integer (nullable = true)
 |-- ground_truth: integer (nullable = true)
 |-- album_score: integer (nullable = true)
 |-- artist_score: integer (nullable = true)
 |-- genre_score: double (nullable = true)



In [5]:
feature_columns = ['album_score', 'artist_score', 'genre_score']
stages = []
assembler_inputs = feature_columns
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol='features')    # merges multiple columns into a vector column
stages += [assembler]

label_column = 'ground_truth'
label_string_idx = StringIndexer(inputCol=label_column, outputCol='label')
stages += [label_string_idx]

pipeline = Pipeline(stages=stages)               # initialize the pipeline
pipeline_model = pipeline.fit(ratings_df)        # fit the pipeline model
train_df = pipeline_model.transform(ratings_df)  # transform the input DF with the pipeline model

In [6]:
selected_columns = ['label', 'features'] + ratings_columns
train_df = train_df.select(selected_columns)
train_df.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- userID: integer (nullable = true)
 |-- trackID: integer (nullable = true)
 |-- ground_truth: integer (nullable = true)
 |-- album_score: integer (nullable = true)
 |-- artist_score: integer (nullable = true)
 |-- genre_score: double (nullable = true)



In [7]:
pd.DataFrame(train_df.take(5), columns=train_df.columns).transpose()

Unnamed: 0,0,1,2,3,4
label,1,1,0,0,1
features,"[90.0, 50.0, 85.0]","[90.0, 0.0, 85.0]","(0.0, 0.0, 0.0)","[0.0, 0.0, 90.0]","[90.0, 50.0, 85.0]"
userID,200031,200031,200031,200031,200031
trackID,30877,8244,130183,198762,34503
ground_truth,1,1,0,0,1
album_score,90,90,0,0,90
artist_score,50,0,0,0,50
genre_score,85,85,0,90,85


In [8]:
train_df, test_df = train_df.randomSplit([0.7, 0.3], seed=2021)

In [9]:
print(f'Training Dataset Count: {train_df.count()}')
print(f'Test Dataset Count: {test_df.count()}')

Training Dataset Count: 4194
Test Dataset Count: 1806


In [10]:
# Load and prepare prediction file

prediction_df = spark.read.csv('genre_test.txt', sep='|', inferSchema=True)    # inferSchema scans csv twice to get datatype of each column
prediction_df = prediction_df.withColumnRenamed("_c0", "userID").withColumnRenamed("_c1", "trackID").withColumnRenamed("_c2", "albumScore").withColumnRenamed("_c3", "artistScore").withColumnRenamed("_c4", "genreScore")
prediction_df.count()

120000

In [11]:
prediction_columns = prediction_df.columns
prediction_columns

['userID', 'trackID', 'albumScore', 'artistScore', 'genreScore']

In [12]:
prediction_df = prediction_df.withColumn('prediction', lit('0'))
pd.DataFrame(prediction_df.take(5), columns=prediction_df.columns).transpose()

Unnamed: 0,0,1,2,3,4
userID,199810,199810,199810,199810,199810
trackID,208019,74139,9903,242681,18515
albumScore,0,0,0,0,0
artistScore,0,0,0,0,70
genreScore,0,80,0,0,0
prediction,0,0,0,0,0


In [13]:
prediction_df.printSchema()
feature_columns = ['albumScore', 'artistScore', 'genreScore']
stages = []
assembler_inputs = feature_columns
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol='features')    # merges multiple columns into a vector column
stages += [assembler]

root
 |-- userID: integer (nullable = true)
 |-- trackID: integer (nullable = true)
 |-- albumScore: integer (nullable = true)
 |-- artistScore: integer (nullable = true)
 |-- genreScore: double (nullable = true)
 |-- prediction: string (nullable = false)



In [14]:
label_column = 'prediction'
label_string_idx = StringIndexer(inputCol=label_column, outputCol='label')
stages += [label_string_idx]

prediction_pipeline = Pipeline(stages=stages)                       # initialize the pipeline
prediction_pipeline_model = prediction_pipeline.fit(prediction_df)  # fit the pipeline model
prediction_df = prediction_pipeline_model.transform(prediction_df)  # transform the input DF with the pipeline model

selected_columns = ['label', 'features'] + prediction_columns
prediction_df = prediction_df.select(selected_columns)
prediction_df.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- userID: integer (nullable = true)
 |-- trackID: integer (nullable = true)
 |-- albumScore: integer (nullable = true)
 |-- artistScore: integer (nullable = true)
 |-- genreScore: double (nullable = true)



In [15]:
pd.DataFrame(prediction_df.take(5), columns=prediction_df.columns).transpose()

Unnamed: 0,0,1,2,3,4
label,0,0,0,0,0
features,"(0.0, 0.0, 0.0)","[0.0, 0.0, 80.0]","(0.0, 0.0, 0.0)","(0.0, 0.0, 0.0)","[0.0, 70.0, 0.0]"
userID,199810,199810,199810,199810,199810
trackID,208019,74139,9903,242681,18515
albumScore,0,0,0,0,0
artistScore,0,0,0,0,70
genreScore,0,80,0,0,0


In [16]:
# Decision Tree

start_time = time.time()

dt = DecisionTreeClassifier(featuresCol='features', labelCol='label', maxDepth=20)
dt_model = dt.fit(train_df)

end_time = time.time()
elapsed_time = end_time - start_time
print(f'Done! Time elapsed - {elapsed_time:.2f} seconds.')

Done! Time elapsed - 8.03 seconds.


In [17]:
predictions_dt = dt_model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')    # initialize an Evaluator for Multiclass Classification
accuracy = evaluator.evaluate(predictions_dt)    # evaluate decision tree model on predictions
print(f'Test Error = {1.0 - accuracy:.2%}')

Test Error = 15.12%


In [18]:
sort_predictions_dt = predictions_dt.select('userID', 'trackID', 'label', 'probability', 'rawPrediction', 'prediction').sort(col('userID').asc(), col('probability').desc())
sort_predictions_dt.show(6)

+------+-------+-----+--------------------+--------------+----------+
|userID|trackID|label|         probability| rawPrediction|prediction|
+------+-------+-----+--------------------+--------------+----------+
|200031| 227283|  0.0|[0.19010416666666...|  [73.0,311.0]|       1.0|
|200031|  30877|  1.0|[0.02325581395348...|   [3.0,126.0]|       1.0|
|200031|  34503|  1.0|[0.02325581395348...|   [3.0,126.0]|       1.0|
|200032| 218377|  0.0|[0.84828807556080...|[1437.0,257.0]|       0.0|
|200032| 110262|  0.0|[0.84828807556080...|[1437.0,257.0]|       0.0|
|200055| 175557|  0.0|[0.84828807556080...|[1437.0,257.0]|       0.0|
+------+-------+-----+--------------------+--------------+----------+
only showing top 6 rows



In [19]:
dt_predictions = dt_model.transform(prediction_df)    # transform prediction_df with decision tree model
dt_predictions.select('userID', 'trackID', 'probability', 'rawPrediction', 'prediction').show(12)

+------+-------+--------------------+--------------+----------+
|userID|trackID|         probability| rawPrediction|prediction|
+------+-------+--------------------+--------------+----------+
|199810| 208019|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810|  74139|[0.72456575682382...| [292.0,111.0]|       0.0|
|199810|   9903|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810| 242681|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810|  18515|   [0.28125,0.71875]|    [9.0,23.0]|       1.0|
|199810| 105760|[0.19010416666666...|  [73.0,311.0]|       1.0|
|199812| 276940|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199812| 142408|[0.00278940027894...|   [2.0,715.0]|       1.0|
|199812| 130023|[0.00278940027894...|   [2.0,715.0]|       1.0|
|199812|  29189|[0.72456575682382...| [292.0,111.0]|       0.0|
|199812| 223706|[0.13888888888888...|    [5.0,31.0]|       1.0|
|199812| 211361|[0.84828807556080...|[1437.0,257.0]|       0.0|
+------+-------+--------------------+---

In [20]:
# col('userID').asc() sort the user ascending
# col('probability').desc() sort the probability descending (from large to small)
sort_dt_predictions = dt_predictions.select('userID', 'trackID', 'probability', 'rawPrediction', 'prediction').sort(col('userID').asc(), col('probability').desc())
sort_dt_predictions.show(6)

+------+-------+--------------------+--------------+----------+
|userID|trackID|         probability| rawPrediction|prediction|
+------+-------+--------------------+--------------+----------+
|199810|   9903|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810| 208019|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810| 242681|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810|  74139|[0.72456575682382...| [292.0,111.0]|       0.0|
|199810|  18515|   [0.28125,0.71875]|    [9.0,23.0]|       1.0|
|199810| 105760|[0.19010416666666...|  [73.0,311.0]|       1.0|
+------+-------+--------------------+--------------+----------+
only showing top 6 rows



In [22]:
pd_sort_dt_predictions = sort_dt_predictions.toPandas().fillna(0.0)    # create pandas df
pd_sort_dt_predictions

Unnamed: 0,userID,trackID,probability,rawPrediction,prediction
0,199810,208019,"[0.8482880755608029, 0.15171192443919718]","[1437.0, 257.0]",0.0
1,199810,9903,"[0.8482880755608029, 0.15171192443919718]","[1437.0, 257.0]",0.0
2,199810,242681,"[0.8482880755608029, 0.15171192443919718]","[1437.0, 257.0]",0.0
3,199810,74139,"[0.7245657568238213, 0.27543424317617865]","[292.0, 111.0]",0.0
4,199810,18515,"[0.28125, 0.71875]","[9.0, 23.0]",1.0
...,...,...,...,...,...
119995,249010,86104,"[0.8482880755608029, 0.15171192443919718]","[1437.0, 257.0]",0.0
119996,249010,293818,"[0.8482880755608029, 0.15171192443919718]","[1437.0, 257.0]",0.0
119997,249010,110470,"[0.002789400278940028, 0.99721059972106]","[2.0, 715.0]",1.0
119998,249010,186634,"[0.002789400278940028, 0.99721059972106]","[2.0, 715.0]",1.0


In [23]:
columns_to_write = ['userID', 'trackID']
pd_sort_dt_predictions.to_csv('decision_tree_predictions.csv', index=False, header=None, columns=columns_to_write)    # write to csv (without header)

In [24]:
f_dt_predictions = open('decision_tree_predictions.csv')   
f_dt_final_predictions = open('decision_tree_final_predictions.csv', 'w')

In [25]:
# Write header
f_dt_final_predictions.write('TrackID,Predictor\n')

18

In [26]:
# Initialize some values
last_user_id = -1
track_id_out_vec = [0] * 6

start_time = time.time()

# Go through each line of the predictions file
for line in f_dt_predictions:
    arr_out = line.strip().split(',')    # remove any spaces/new lines and create list 
    user_id_out = arr_out[0]             # set user
    track_id_out = arr_out[1]            # set track
    
    if user_id_out != last_user_id:             # if new user reached
        i = 0                                   # reset i
        
    track_id_out_vec[i] = track_id_out          # add trackID to trackID array
        
    i = i + 1                    # increment i
    last_user_id = user_id_out   # set last_user_id as current userID
    
    if i == 6:                               # if last entry for current user reached
        # Here we set the predictions 
        predictions = np.ones(shape=(6)) # initialize numpy array for predictions
        for index in range(0, 3):            
            predictions[index] = 0           # set first 3 values in array to 0 (other 3 are 1)
        
        # Here we write to the final predictions file for the 6 track predictions for the current user
        for ii in range(0, 6):         
            out_str = str(user_id_out) + '_' + str(track_id_out_vec[ii]) + ',' + str(int(predictions[ii]))
            f_dt_final_predictions.write(out_str + '\n')

        
end_time = time.time()
elapsed_time = end_time - start_time
print(f'Done! Time elapsed - {elapsed_time:.2f} seconds.')

f_dt_predictions.close()
f_dt_final_predictions.close()

Done! Time elapsed - 2.09 seconds.


In [27]:
# Random Forest

from pyspark.ml.classification import RandomForestClassifier

start_time = time.time()

rf = RandomForestClassifier(featuresCol='features', labelCol='label')
rf_model = rf.fit(train_df)

end_time = time.time()
elapsed_time = end_time - start_time
print(f'Done! Time elapsed - {elapsed_time:.2f} seconds.')

predictions_rf = rf_model.transform(test_df)

evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')    # initialize an Evaluator for Multiclass Classification
accuracy = evaluator.evaluate(predictions_rf)    # evaluate random forest model on predictions
print(f'Test Error = {1.0 - accuracy:.2%}')

Done! Time elapsed - 3.27 seconds.
Test Error = 14.51%


In [28]:
sort_predictions_rf = predictions_rf.select('userID', 'trackID', 'label', 'probability', 'rawPrediction', 'prediction').sort(col('userID').asc(), col('probability').desc())
sort_predictions_rf.show(6)

+------+-------+-----+--------------------+--------------------+----------+
|userID|trackID|label|         probability|       rawPrediction|prediction|
+------+-------+-----+--------------------+--------------------+----------+
|200031| 227283|  0.0|[0.17659724265951...|[3.53194485319030...|       1.0|
|200031|  30877|  1.0|[0.02303244231395...|[0.46064884627903...|       1.0|
|200031|  34503|  1.0|[0.02303244231395...|[0.46064884627903...|       1.0|
|200032| 218377|  0.0|[0.83477151665023...|[16.6954303330047...|       0.0|
|200032| 110262|  0.0|[0.83477151665023...|[16.6954303330047...|       0.0|
|200055| 175557|  0.0|[0.83477151665023...|[16.6954303330047...|       0.0|
+------+-------+-----+--------------------+--------------------+----------+
only showing top 6 rows



In [29]:
rf_predictions = dt_model.transform(prediction_df)    # transform prediction_df with random forest model
rf_predictions.select('userID', 'trackID', 'probability', 'rawPrediction', 'prediction').show(12)

+------+-------+--------------------+--------------+----------+
|userID|trackID|         probability| rawPrediction|prediction|
+------+-------+--------------------+--------------+----------+
|199810| 208019|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810|  74139|[0.72456575682382...| [292.0,111.0]|       0.0|
|199810|   9903|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810| 242681|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810|  18515|   [0.28125,0.71875]|    [9.0,23.0]|       1.0|
|199810| 105760|[0.19010416666666...|  [73.0,311.0]|       1.0|
|199812| 276940|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199812| 142408|[0.00278940027894...|   [2.0,715.0]|       1.0|
|199812| 130023|[0.00278940027894...|   [2.0,715.0]|       1.0|
|199812|  29189|[0.72456575682382...| [292.0,111.0]|       0.0|
|199812| 223706|[0.13888888888888...|    [5.0,31.0]|       1.0|
|199812| 211361|[0.84828807556080...|[1437.0,257.0]|       0.0|
+------+-------+--------------------+---

In [30]:
# col('userID').asc() sort the user ascending
# col('probability').desc() sort the probability descending (from large to small)
sort_rf_predictions = rf_predictions.select('userID', 'trackID', 'probability', 'rawPrediction', 'prediction').sort(col('userID').asc(), col('probability').desc())
sort_rf_predictions.show(6)

+------+-------+--------------------+--------------+----------+
|userID|trackID|         probability| rawPrediction|prediction|
+------+-------+--------------------+--------------+----------+
|199810|   9903|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810| 208019|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810| 242681|[0.84828807556080...|[1437.0,257.0]|       0.0|
|199810|  74139|[0.72456575682382...| [292.0,111.0]|       0.0|
|199810|  18515|   [0.28125,0.71875]|    [9.0,23.0]|       1.0|
|199810| 105760|[0.19010416666666...|  [73.0,311.0]|       1.0|
+------+-------+--------------------+--------------+----------+
only showing top 6 rows



In [31]:
pd_sort_rf_predictions = sort_rf_predictions.toPandas().fillna(0.0)    # create pandas df
pd_sort_rf_predictions

Unnamed: 0,userID,trackID,probability,rawPrediction,prediction
0,199810,208019,"[0.8482880755608029, 0.15171192443919718]","[1437.0, 257.0]",0.0
1,199810,9903,"[0.8482880755608029, 0.15171192443919718]","[1437.0, 257.0]",0.0
2,199810,242681,"[0.8482880755608029, 0.15171192443919718]","[1437.0, 257.0]",0.0
3,199810,74139,"[0.7245657568238213, 0.27543424317617865]","[292.0, 111.0]",0.0
4,199810,18515,"[0.28125, 0.71875]","[9.0, 23.0]",1.0
...,...,...,...,...,...
119995,249010,86104,"[0.8482880755608029, 0.15171192443919718]","[1437.0, 257.0]",0.0
119996,249010,293818,"[0.8482880755608029, 0.15171192443919718]","[1437.0, 257.0]",0.0
119997,249010,110470,"[0.002789400278940028, 0.99721059972106]","[2.0, 715.0]",1.0
119998,249010,186634,"[0.002789400278940028, 0.99721059972106]","[2.0, 715.0]",1.0


In [32]:
columns_to_write = ['userID', 'trackID']
pd_sort_rf_predictions.to_csv('random_forest_predictions.csv', index=False, header=None, columns=columns_to_write)    # write to csv (without headers)

In [33]:
f_rf_predictions = open('random_forest_predictions.csv')   
f_rf_final_predictions = open('random_forest_final_predictions.csv', 'w')

In [34]:
# Write header
f_rf_final_predictions.write('TrackID,Predictor\n')

18

In [35]:
# Initialize some values
last_user_id = -1
track_id_out_vec = [0] * 6

start_time = time.time()

# Go through each line of the predictions file
for line in f_rf_predictions:
    arr_out = line.strip().split(',')    # remove any spaces/new lines and create list 
    user_id_out = arr_out[0]             # set user
    track_id_out = arr_out[1]            # set track
    
    if user_id_out != last_user_id:             # if new user reached
        i = 0                                   # reset i
        
    track_id_out_vec[i] = track_id_out          # add trackID to trackID array
        
    i = i + 1                    # increment i
    last_user_id = user_id_out   # set last_user_id as current userID
    
    if i == 6:                               # if last entry for current user reached
        # Here we set the predictions 
        predictions = np.ones(shape=(6)) # initialize numpy array for predictions
        for index in range(0, 3):            
            predictions[index] = 0           # set first 3 values in array to 0 (other 3 are 1)
        
        # Here we write to the final predictions file for the 6 track predictions for the current user
        for ii in range(0, 6):         
            out_str = str(user_id_out) + '_' + str(track_id_out_vec[ii]) + ',' + str(int(predictions[ii]))
            f_rf_final_predictions.write(out_str + '\n')

        
end_time = time.time()
elapsed_time = end_time - start_time
print(f'Done! Time elapsed - {elapsed_time:.2f} seconds.')

f_rf_predictions.close()
f_rf_final_predictions.close()

Done! Time elapsed - 2.12 seconds.


In [36]:
# Gradient Boosted Tree

from pyspark.ml.classification import GBTClassifier

start_time = time.time()

gbt = GBTClassifier(maxIter=100)
gbt_model = gbt.fit(train_df)

end_time = time.time()
elapsed_time = end_time - start_time
print(f'Done! Time elapsed - {elapsed_time:.2f} seconds.')

predictions_gbt = gbt_model.transform(test_df)

evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')    # initialize an Evaluator for Multiclass Classification
accuracy = evaluator.evaluate(predictions_gbt)    # evaluate random forest model on predictions
print(f'Test Error = {1.0 - accuracy:.2%}')

Done! Time elapsed - 66.81 seconds.
Test Error = 15.01%


In [37]:
sort_predictions_gbt = predictions_gbt.select('userID', 'trackID', 'label', 'probability', 'rawPrediction', 'prediction').sort(col('userID').asc(), col('probability').desc())
sort_predictions_gbt.show(6)

gbt_predictions = gbt_model.transform(prediction_df)    # transform prediction_df with gradient-boosted tree model
gbt_predictions.select('userID', 'trackID', 'probability', 'rawPrediction', 'prediction').show(12)

+------+-------+-----+--------------------+--------------------+----------+
|userID|trackID|label|         probability|       rawPrediction|prediction|
+------+-------+-----+--------------------+--------------------+----------+
|200031| 227283|  0.0|[0.21518543762805...|[-0.6469736533077...|       1.0|
|200031|  30877|  1.0|[0.01421111170587...|[-2.1197090246712...|       1.0|
|200031|  34503|  1.0|[0.01421111170587...|[-2.1197090246712...|       1.0|
|200032| 218377|  0.0|[0.84760313000509...|[0.85796220711124...|       0.0|
|200032| 110262|  0.0|[0.84760313000509...|[0.85796220711124...|       0.0|
|200055| 175557|  0.0|[0.84760313000509...|[0.85796220711124...|       0.0|
+------+-------+-----+--------------------+--------------------+----------+
only showing top 6 rows

+------+-------+--------------------+--------------------+----------+
|userID|trackID|         probability|       rawPrediction|prediction|
+------+-------+--------------------+--------------------+----------+
|1998

In [38]:
# col('userID').asc() sort the user ascending
# col('probability').desc() sort the probability descending (from large to small)
sort_gbt_predictions = gbt_predictions.select('userID', 'trackID', 'probability', 'rawPrediction', 'prediction').sort(col('userID').asc(), col('probability').desc())
sort_gbt_predictions.show(6)

+------+-------+--------------------+--------------------+----------+
|userID|trackID|         probability|       rawPrediction|prediction|
+------+-------+--------------------+--------------------+----------+
|199810|   9903|[0.84760313000509...|[0.85796220711124...|       0.0|
|199810| 208019|[0.84760313000509...|[0.85796220711124...|       0.0|
|199810| 242681|[0.84760313000509...|[0.85796220711124...|       0.0|
|199810|  74139|[0.72350749410959...|[0.48096359351340...|       0.0|
|199810|  18515|[0.27996744400564...|[-0.4723115513229...|       1.0|
|199810| 105760|[0.20031666336396...|[-0.6921581944293...|       1.0|
+------+-------+--------------------+--------------------+----------+
only showing top 6 rows



In [39]:
pd_sort_gbt_predictions = sort_gbt_predictions.toPandas().fillna(0.0)    # create pandas df

In [40]:
pd_sort_gbt_predictions

Unnamed: 0,userID,trackID,probability,rawPrediction,prediction
0,199810,208019,"[0.8476031300050952, 0.15239686999490476]","[0.857962207111245, -0.857962207111245]",0.0
1,199810,9903,"[0.8476031300050952, 0.15239686999490476]","[0.857962207111245, -0.857962207111245]",0.0
2,199810,242681,"[0.8476031300050952, 0.15239686999490476]","[0.857962207111245, -0.857962207111245]",0.0
3,199810,74139,"[0.7235074941095933, 0.27649250589040675]","[0.48096359351340895, -0.48096359351340895]",0.0
4,199810,18515,"[0.27996744400564877, 0.7200325559943512]","[-0.4723115513229358, 0.4723115513229358]",1.0
...,...,...,...,...,...
119995,249010,86104,"[0.8476031300050952, 0.15239686999490476]","[0.857962207111245, -0.857962207111245]",0.0
119996,249010,293818,"[0.8476031300050952, 0.15239686999490476]","[0.857962207111245, -0.857962207111245]",0.0
119997,249010,110470,"[0.014470529025492772, 0.9855294709745073]","[-2.1105324653015916, 2.1105324653015916]",1.0
119998,249010,186634,"[0.014470529025492772, 0.9855294709745073]","[-2.1105324653015916, 2.1105324653015916]",1.0


In [41]:
columns_to_write = ['userID', 'trackID']
pd_sort_gbt_predictions.to_csv('grad_boost_predictions.csv', index=False, header=None, columns=columns_to_write)    # write to csv (without headers)

In [42]:
f_gbt_predictions = open('grad_boost_predictions.csv')   
f_gbt_final_predictions = open('grad_boost_final_predictions.csv', 'w')

In [43]:
# Write header
f_gbt_final_predictions.write('TrackID,Predictor\n')

18

In [44]:
# Initialize some values
last_user_id = -1
track_id_out_vec = [0] * 6

start_time = time.time()

# Go through each line of the predictions file
for line in f_gbt_predictions:
    arr_out = line.strip().split(',')    # remove any spaces/new lines and create list 
    user_id_out = arr_out[0]             # set user
    track_id_out = arr_out[1]            # set track
    
    if user_id_out != last_user_id:             # if new user reached
        i = 0                                   # reset i
        
    track_id_out_vec[i] = track_id_out          # add trackID to trackID array
        
    i = i + 1                    # increment i
    last_user_id = user_id_out   # set last_user_id as current userID
    
    if i == 6:                               # if last entry for current user reached
        # Here we set the predictions 
        predictions = np.ones(shape=(6)) # initialize numpy array for predictions
        for index in range(0, 3):            
            predictions[index] = 0           # set first 3 values in array to 0 (other 3 are 1)
        
        # Here we write to the final predictions file for the 6 track predictions for the current user
        for ii in range(0, 6):         
            out_str = str(user_id_out) + '_' + str(track_id_out_vec[ii]) + ',' + str(int(predictions[ii]))
            f_gbt_final_predictions.write(out_str + '\n')

        
end_time = time.time()
elapsed_time = end_time - start_time
print(f'Done! Time elapsed - {elapsed_time:.2f} seconds.')

f_gbt_predictions.close()
f_gbt_final_predictions.close()

Done! Time elapsed - 2.77 seconds.
