In [22]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os
import ast
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

In [23]:
# Step 1: Initialize Spark Session
spark = SparkSession.builder \
    .appName("NLP Example with Spark") \
    .getOrCreate()

# Step 2: Load the dataset into a Spark DataFrame
input_dir = f'../input/data/training'
l2_data = spark.read.parquet(os.path.join(input_dir, 'l2_data_2024-08-20_08-36-29.parquet'))

# Step 3: NLP Transformations
# Tokenize and remove stopwords
tokenizer = Tokenizer(inputCol="review_text_translated", outputCol="words")
l2_data = tokenizer.transform(l2_data)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
l2_data = remover.transform(l2_data)

# Compute TF and IDF
hashing_tf = HashingTF(inputCol="filtered", outputCol="raw_features", numFeatures=20)
l2_data = hashing_tf.transform(l2_data)

idf = IDF(inputCol="raw_features", outputCol="review_text_features")
idf_model = idf.fit(l2_data)
l3_data = idf_model.transform(l2_data)

# Drop intermediate columns
l3_data = l3_data.drop('words', 'filtered', 'raw_features')

l3_data.show()

                                                                                

+-------------------+--------------------+----------+------------------+-------------+-------------------+---------------+----------------------+-------------+-----------------+---------+----------------------+-----------------------+--------------------+
|             region|          hotel_name|avg_rating|user_ratings_total|review_rating|review_date_in_days|review_language|review_text_translated|review_length|         latitude|longitude|distance_to_ski_resort|distance_to_city_center|review_text_features|
+-------------------+--------------------+----------+------------------+-------------+-------------------+---------------+----------------------+-------------+-----------------+---------+----------------------+-----------------------+--------------------+
|             Ordino|        Borda del Pi|       4.4|               581|          5.0|                730|             en|  the outdoor bar w...|          336|       42.5679526|1.5965409|     6.893862270927463|       9.1073101806354

In [24]:
# Step 4: Convert Spark DataFrame to Pandas DataFrame
df = l3_data.toPandas()
df = df.drop(['review_text_translated'], axis=1)

df

                                                                                

Unnamed: 0,region,hotel_name,avg_rating,user_ratings_total,review_rating,review_date_in_days,review_language,review_length,latitude,longitude,distance_to_ski_resort,distance_to_city_center,review_text_features
0,Ordino,Borda del Pi,4.4,581,5.0,730,en,336,42.567953,1.596541,6.893862,9.107310,"(0.6923103386133578, 0.5469841770580245, 1.807..."
1,Encamp,Insitu Eurotel Andorra,3.9,1194,3.0,60,it,216,42.513766,1.533006,4.926610,1.180006,"(0.6923103386133578, 0.0, 0.6026631834477397, ..."
2,Encamp,Hotel Spa Termes Carlemany,4.2,1081,4.0,150,es,861,42.509345,1.544601,5.741960,1.934028,"(1.0384655079200367, 1.6409525311740736, 3.013..."
3,Ordino,Hotel Camp del Serrat,3.8,601,4.0,730,es,126,42.508312,1.551869,6.124637,2.522790,"(0.6923103386133578, 0.5469841770580245, 0.602..."
4,Escaldes-Engordany,Hotel Les Closes,4.3,1257,4.0,150,en,128,42.508618,1.537489,5.588254,1.346464,"(0.6923103386133578, 0.5469841770580245, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30753,Sant Julià de Lòria,Hotel Les Closes,4.3,1257,3.0,1460,en,173,42.508618,1.537489,5.588254,1.346464,"(0.0, 0.0, 0.6026631834477397, 0.3000363220801..."
30754,Canillo,Ona El Tarter,4.1,136,5.0,1460,en,32,42.581135,1.639033,9.017746,12.642774,"(0.3461551693066789, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
30755,La Massana,Rutllan,4.4,655,5.0,150,es,233,42.547381,1.513207,0.938402,4.448480,"(0.0, 0.5469841770580245, 1.2053263668954795, ..."
30756,Andorra la Vella,Hesperia Andorra (antiguo Hotel Fènix),4.2,1797,3.0,90,fr,724,42.509305,1.540308,5.601433,1.583385,"(1.7307758465333944, 2.7349208852901223, 1.205..."


In [26]:
import numpy as np

# Assuming df is your Pandas DataFrame and 'review_text_features' is the column with tuples
df['review_text_features'] = df['review_text_features'].apply(lambda x: np.array(x))
df

Unnamed: 0,region,hotel_name,avg_rating,user_ratings_total,review_rating,review_date_in_days,review_language,review_length,latitude,longitude,distance_to_ski_resort,distance_to_city_center,review_text_features
0,Ordino,Borda del Pi,4.4,581,5.0,730,en,336,42.567953,1.596541,6.893862,9.107310,"[0.6923103386133578, 0.5469841770580245, 1.807..."
1,Encamp,Insitu Eurotel Andorra,3.9,1194,3.0,60,it,216,42.513766,1.533006,4.926610,1.180006,"[0.6923103386133578, 0.0, 0.6026631834477397, ..."
2,Encamp,Hotel Spa Termes Carlemany,4.2,1081,4.0,150,es,861,42.509345,1.544601,5.741960,1.934028,"[1.0384655079200367, 1.6409525311740736, 3.013..."
3,Ordino,Hotel Camp del Serrat,3.8,601,4.0,730,es,126,42.508312,1.551869,6.124637,2.522790,"[0.6923103386133578, 0.5469841770580245, 0.602..."
4,Escaldes-Engordany,Hotel Les Closes,4.3,1257,4.0,150,en,128,42.508618,1.537489,5.588254,1.346464,"[0.6923103386133578, 0.5469841770580245, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30753,Sant Julià de Lòria,Hotel Les Closes,4.3,1257,3.0,1460,en,173,42.508618,1.537489,5.588254,1.346464,"[0.0, 0.0, 0.6026631834477397, 0.3000363220801..."
30754,Canillo,Ona El Tarter,4.1,136,5.0,1460,en,32,42.581135,1.639033,9.017746,12.642774,"[0.3461551693066789, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
30755,La Massana,Rutllan,4.4,655,5.0,150,es,233,42.547381,1.513207,0.938402,4.448480,"[0.0, 0.5469841770580245, 1.2053263668954795, ..."
30756,Andorra la Vella,Hesperia Andorra (antiguo Hotel Fènix),4.2,1797,3.0,90,fr,724,42.509305,1.540308,5.601433,1.583385,"[1.7307758465333944, 2.7349208852901223, 1.205..."


In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming the desired vector length is the maximum length found in the data
max_len = max(df['review_text_features'].apply(len))

# Pad sequences to ensure all vectors have the same length
df['review_text_features'] = pad_sequences(df['review_text_features'], maxlen=max_len, padding='post', dtype='float32')

In [28]:
df

Unnamed: 0,region,hotel_name,avg_rating,user_ratings_total,review_rating,review_date_in_days,review_language,review_length,latitude,longitude,distance_to_ski_resort,distance_to_city_center,review_text_features
0,Ordino,Borda del Pi,4.4,581,5.0,730,en,336,42.567953,1.596541,6.893862,9.107310,0.692310
1,Encamp,Insitu Eurotel Andorra,3.9,1194,3.0,60,it,216,42.513766,1.533006,4.926610,1.180006,0.692310
2,Encamp,Hotel Spa Termes Carlemany,4.2,1081,4.0,150,es,861,42.509345,1.544601,5.741960,1.934028,1.038465
3,Ordino,Hotel Camp del Serrat,3.8,601,4.0,730,es,126,42.508312,1.551869,6.124637,2.522790,0.692310
4,Escaldes-Engordany,Hotel Les Closes,4.3,1257,4.0,150,en,128,42.508618,1.537489,5.588254,1.346464,0.692310
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30753,Sant Julià de Lòria,Hotel Les Closes,4.3,1257,3.0,1460,en,173,42.508618,1.537489,5.588254,1.346464,0.000000
30754,Canillo,Ona El Tarter,4.1,136,5.0,1460,en,32,42.581135,1.639033,9.017746,12.642774,0.346155
30755,La Massana,Rutllan,4.4,655,5.0,150,es,233,42.547381,1.513207,0.938402,4.448480,0.000000
30756,Andorra la Vella,Hesperia Andorra (antiguo Hotel Fènix),4.2,1797,3.0,90,fr,724,42.509305,1.540308,5.601433,1.583385,1.730776


24/08/20 22:17:39 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 215931 ms exceeds timeout 120000 ms
24/08/20 22:17:39 WARN SparkContext: Killing executors is not supported by current scheduler.
24/08/20 22:17:40 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

In [25]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert the review_text_features column into a list of lists (or a 2D array)
df['review_text_features'] = df['review_text_features'].apply(lambda x: list(ast.literal_eval(x)))

# Determine the maximum length of the feature vectors (if not consistent)
max_len = max(df['review_text_features'].apply(len))

# Pad all vectors to the same length (max_len) with zeros
df['review_text_features'] = pad_sequences(df['review_text_features'], maxlen=max_len, dtype='float32', padding='post').tolist()

# Convert the column into a NumPy array
review_text_features_array = np.array(df['review_text_features'].tolist())

# Now you can drop the original column if needed
df = df.drop('review_text_features', axis=1)

# Example: Append the processed review_text_features back to your DataFrame or use directly in your model
df = np.hstack((df.values, review_text_features_array))

ValueError: malformed node or string: SparseVector(20, {0: 0.6923, 1: 0.547, 2: 1.808, 3: 0.3, 5: 1.7185, 6: 0.5691, 8: 1.084, 9: 0.3914, 10: 1.3526, 12: 1.5143, 13: 0.748, 16: 1.2268, 19: 0.7265})

In [9]:
# 1. One-Hot Encoding for the 'region' column
df_region_encoded = pd.get_dummies(df['region'], prefix='region')
df_region_encoded

Unnamed: 0,region_Andorra la Vella,region_Canillo,region_Encamp,region_Escaldes-Engordany,region_La Massana,region_Ordino,region_Sant Julià de Lòria
0,False,False,False,False,False,True,False
1,False,False,True,False,False,False,False
2,False,False,True,False,False,False,False
3,False,False,False,False,False,True,False
4,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...
30753,False,False,False,False,False,False,True
30754,False,True,False,False,False,False,False
30755,False,False,False,False,True,False,False
30756,True,False,False,False,False,False,False


In [10]:
# 2. Handling 'review_language'
# Calculate the percentage of each language
language_counts = df['review_language'].value_counts(normalize=True)

# Combine languages that are less than 2% present
languages_to_combine = language_counts[language_counts < 0.02].index
df['review_language'] = df['review_language'].apply(lambda x: 'other' if x in languages_to_combine else x)

# One-Hot Encode the remaining languages
df_language_encoded = pd.get_dummies(df['review_language'], prefix='lang')

df_language_encoded

Unnamed: 0,lang_ca,lang_en,lang_es,lang_fr,lang_other
0,False,True,False,False,False
1,False,False,False,False,True
2,False,False,True,False,False
3,False,False,True,False,False
4,False,True,False,False,False
...,...,...,...,...,...
30753,False,True,False,False,False
30754,False,True,False,False,False
30755,False,False,True,False,False
30756,False,False,False,True,False


In [11]:
# 3. Assigning IDs to 'hotel_name'
df['hotel_id'] = df['hotel_name'].astype('category').cat.codes

In [12]:
# Dropping the original 'hotel_name' and 'review_language' columns since we have encoded them
df = df.drop(['region', 'review_language', 'hotel_name'], axis=1)

# Concatenating the encoded columns with the original DataFrame
df = pd.concat([df, df_region_encoded, df_language_encoded], axis=1)

In [13]:
# Reordering columns: hotel_id first, avg_rating second
columns_order = ['hotel_id', 'avg_rating'] + [col for col in df.columns if col not in ['hotel_id', 'avg_rating']]
df = df[columns_order]
df

Unnamed: 0,hotel_id,avg_rating,user_ratings_total,review_rating,review_date_in_days,review_text_translated,review_length,latitude,longitude,distance_to_ski_resort,...,region_Encamp,region_Escaldes-Engordany,region_La Massana,region_Ordino,region_Sant Julià de Lòria,lang_ca,lang_en,lang_es,lang_fr,lang_other
0,26,4.4,581,5.0,730,the outdoor bar with seating that faced the va...,336,42.567953,1.596541,6.893862,...,False,False,False,True,False,False,True,False,False,False
1,131,3.9,1194,3.0,60,We stayed one night at this structure that was...,216,42.513766,1.533006,4.926610,...,True,False,False,False,False,False,False,False,False,True
2,116,4.2,1081,4.0,150,Good location two minutes walk from the main s...,861,42.509345,1.544601,5.741960,...,True,False,False,False,False,False,False,True,False,False
3,65,3.8,601,4.0,730,It is located in a quiet area perhaps someAbou...,126,42.508312,1.551869,6.124637,...,False,False,False,True,False,False,False,True,False,False
4,79,4.3,1257,4.0,150,stayed just 1night hotel was clean staff frien...,128,42.508618,1.537489,5.588254,...,False,True,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30753,79,4.3,1257,3.0,1460,hotel was ok but had badly broken sleep as we ...,173,42.508618,1.537489,5.588254,...,False,False,False,False,True,False,True,False,False,False
30754,140,4.1,136,5.0,1460,nice apartments in the mountains,32,42.581135,1.639033,9.017746,...,False,False,False,False,False,False,True,False,False,False
30755,145,4.4,655,5.0,150,Just at the LDO of the telecabine of the Palar...,233,42.547381,1.513207,0.938402,...,False,False,True,False,False,False,False,True,False,False
30756,52,4.2,1797,3.0,90,No parking we have this park 1 km aquatic corn...,724,42.509305,1.540308,5.601433,...,False,False,False,False,False,False,False,False,True,False


In [15]:
# 1. Remove the 'latitude' and 'longitude' columns
df = df.drop(['latitude', 'longitude'], axis=1)

In [17]:
# Assuming df is your DataFrame
def parse_review_features(feature_dict):
    # Directly convert the dictionary into a pandas Series
    return pd.Series(feature_dict)

# Apply the parsing function to each row in 'review_text_features'
df_review_features_parsed = df['review_text_features'].apply(parse_review_features)

df_review_features_parsed

Unnamed: 0,0
0,"(0.6923103386133578, 0.5469841770580245, 1.807..."
1,"(0.6923103386133578, 0.0, 0.6026631834477397, ..."
2,"(1.0384655079200367, 1.6409525311740736, 3.013..."
3,"(0.6923103386133578, 0.5469841770580245, 0.602..."
4,"(0.6923103386133578, 0.5469841770580245, 0.0, ..."
...,...
30753,"(0.0, 0.0, 0.6026631834477397, 0.3000363220801..."
30754,"(0.3461551693066789, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
30755,"(0.0, 0.5469841770580245, 1.2053263668954795, ..."
30756,"(1.7307758465333944, 2.7349208852901223, 1.205..."


In [None]:
# Drop the original 'review_text_features' column and concatenate the parsed features
df = df.drop('review_text_features', axis=1)
df = pd.concat([df, df_review_features_parsed], axis=1)
df

Unnamed: 0,hotel_id,avg_rating,user_ratings_total,review_rating,review_date_in_days,review_length,distance_to_ski_resort,distance_to_city_center,region_Andorra la Vella,region_Canillo,...,region_Sant Julià de Lòria,lang_ca,lang_en,lang_es,lang_fr,lang_other,type,size,indices,values
0,166,4.4,1729,5.0,120,115,5.326212,1.504967,False,False,...,False,False,False,True,False,False,0,20,"[0, 1, 3, 4, 5, 7, 8, 9, 15, 17]","[0.34615516930667883, 0.5469841770580245, 0.30..."
1,88,4.2,680,5.0,180,262,5.561925,5.103837,False,False,...,False,False,False,True,False,False,0,20,"[1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 1...","[2.187936708232098, 1.2053263668954795, 0.9001..."
2,143,4.7,196,4.0,1095,71,6.034156,5.964927,False,False,...,False,True,False,False,False,False,0,20,"[0, 4, 6, 9, 13, 16]","[0.6923103386133577, 0.3674755616406844, 1.138..."
3,72,4.1,1243,4.0,1460,5,5.592204,1.733360,False,False,...,False,False,False,False,False,True,0,20,"[1, 6]","[1.093968354116049, 0.5690804439716024]"
4,27,4.6,247,5.0,150,596,6.893862,9.107310,False,True,...,False,False,False,True,False,False,0,20,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,...","[0.6923103386133577, 3.281905062348147, 1.2053..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30753,90,4.2,1595,5.0,180,148,5.330781,0.778600,False,False,...,False,False,True,False,False,False,0,20,"[0, 1, 2, 4, 5, 8, 9, 10, 12, 13, 15]","[0.34615516930667883, 0.5469841770580245, 1.20..."
30754,66,4.4,670,2.0,0,333,1.679513,5.399207,False,False,...,False,False,False,False,True,False,0,20,"[0, 2, 3, 5, 7, 8, 10, 11, 12, 13, 14, 18, 19]","[0.6923103386133577, 0.6026631834477397, 0.300..."
30755,6,4.2,600,4.0,120,315,3.425662,9.167398,False,False,...,False,False,False,True,False,False,0,20,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14,...","[1.0384655079200364, 1.6409525311740736, 1.807..."
30756,123,3.4,1240,4.0,0,502,5.572432,0.479916,False,False,...,False,False,False,True,False,False,0,20,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15,...","[0.34615516930667883, 1.093968354116049, 1.807..."


In [None]:
# List of numerical columns to normalize
numerical_columns = [
    'avg_rating', 
    'user_ratings_total', 
    'review_rating', 
    'review_date_in_days', 
    'review_length', 
    'distance_to_ski_resort', 
    'distance_to_city_center',
    'type',
    'size',
    'hotel_id'
]

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler to the data and transform the numerical columns
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
df

Unnamed: 0,hotel_id,avg_rating,user_ratings_total,review_rating,review_date_in_days,review_length,distance_to_ski_resort,distance_to_city_center,region_Andorra la Vella,region_Canillo,...,region_Sant Julià de Lòria,lang_ca,lang_en,lang_es,lang_fr,lang_other,type,size,indices,values
0,1.000000,0.76,0.512456,0.444444,0.019339,0.023486,0.538974,0.090672,False,False,...,False,False,False,True,False,False,0.0,0.0,"[0, 1, 3, 4, 5, 7, 8, 9, 15, 17]","[0.34615516930667883, 0.5469841770580245, 0.30..."
1,0.530120,0.68,0.201364,0.444444,0.029009,0.053770,0.565652,0.314792,False,False,...,False,False,False,True,False,False,0.0,0.0,"[1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 1...","[2.187936708232098, 1.2053263668954795, 0.9001..."
2,0.861446,0.88,0.057829,0.333333,0.176471,0.014421,0.619097,0.368417,False,False,...,False,True,False,False,False,False,0.0,0.0,"[0, 4, 6, 9, 13, 16]","[0.6923103386133577, 0.3674755616406844, 1.138..."
3,0.433735,0.64,0.368327,0.333333,0.235294,0.000824,0.569078,0.104896,False,False,...,False,False,False,False,False,True,0.0,0.0,"[1, 6]","[1.093968354116049, 0.5690804439716024]"
4,0.162651,0.84,0.072954,0.444444,0.024174,0.122579,0.716396,0.564109,False,True,...,False,False,False,True,False,False,0.0,0.0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,...","[0.6923103386133577, 3.281905062348147, 1.2053..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30753,0.542169,0.68,0.472716,0.444444,0.029009,0.030284,0.539492,0.045438,False,False,...,False,False,True,False,False,False,0.0,0.0,"[0, 1, 2, 4, 5, 8, 9, 10, 12, 13, 15]","[0.34615516930667883, 0.5469841770580245, 1.20..."
30754,0.397590,0.76,0.198399,0.111111,0.000000,0.068397,0.126254,0.333187,False,False,...,False,False,False,False,True,False,0.0,0.0,"[0, 2, 3, 5, 7, 8, 10, 11, 12, 13, 14, 18, 19]","[0.6923103386133577, 0.6026631834477397, 0.300..."
30755,0.036145,0.68,0.177639,0.333333,0.019339,0.064689,0.323877,0.567851,False,False,...,False,False,False,True,False,False,0.0,0.0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14,...","[1.0384655079200364, 1.6409525311740736, 1.807..."
30756,0.740964,0.36,0.367438,0.333333,0.000000,0.103214,0.566841,0.026837,False,False,...,False,False,False,True,False,False,0.0,0.0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15,...","[0.34615516930667883, 1.093968354116049, 1.807..."


In [None]:
# 1. Prepare the data
# Convert boolean columns to integers (1 or 0)
df = df.copy()  # Ensure you are working with a fresh copy of your DataFrame
bool_columns = df.select_dtypes(include=['bool']).columns
df[bool_columns] = df[bool_columns].astype(int)

# Expand indices into separate columns
max_len_indices = df['indices'].apply(len).max()
max_len_values = df['values'].apply(len).max()

for i in range(max_len_indices):
    df[f'index_{i}'] = df['indices'].apply(lambda x: x[i] if i < len(x) else 0)

for i in range(max_len_values):
    df[f'value_{i}'] = df['values'].apply(lambda x: x[i] if i < len(x) else 0)

# Drop the original 'indices' and 'values' columns
df = df.drop(['indices', 'values'], axis=1)

# Convert all numerical data to float32 and handle any object type issues
df = df.apply(pd.to_numeric, errors='coerce')

# Replace NaN with 0 or another placeholder if necessary
df = df.fillna(0)
df

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split



# Convert to tensors
X = df.drop('avg_rating', axis=1).values
y = df['avg_rating'].values

# Splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train)

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Check the shape of the tensors to confirm
print(X_train.shape, y_train.shape)

X_train

[[0.79518072 0.03202847 0.44444444 ... 0.         0.         0.        ]
 [0.75301205 0.07443654 0.44444444 ... 0.85351438 1.11452306 1.45300137]
 [0.24698795 0.09934757 0.44444444 ... 0.         0.         0.        ]
 ...
 [0.02409639 0.3054567  0.44444444 ... 0.         0.         0.        ]
 [0.78915663 0.35379597 0.44444444 ... 1.28027157 1.11452306 1.45300137]
 [0.51807229 0.14175563 0.44444444 ... 0.         0.         0.        ]]
torch.Size([24606, 61]) torch.Size([24606])


tensor([[0.7952, 0.0320, 0.4444,  ..., 0.0000, 0.0000, 0.0000],
        [0.7530, 0.0744, 0.4444,  ..., 0.8535, 1.1145, 1.4530],
        [0.2470, 0.0993, 0.4444,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0241, 0.3055, 0.4444,  ..., 0.0000, 0.0000, 0.0000],
        [0.7892, 0.3538, 0.4444,  ..., 1.2803, 1.1145, 1.4530],
        [0.5181, 0.1418, 0.4444,  ..., 0.0000, 0.0000, 0.0000]])

In [None]:
# 2. Define a Basic Neural Network
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

# 3. Training the model
input_dim = X_train.shape[1]
model = SimpleNN(input_dim)

# Loss and optimizer
criterion = nn.MSELoss()  # Assuming a regression problem
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    outputs = model(X_train)
    loss = criterion(outputs, y_train.unsqueeze(1))  # unsqueeze to match output dimensions
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluating the model
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs, y_test.unsqueeze(1))
    print(f'Test Loss: {test_loss.item():.4f}')

Epoch [1/20], Loss: 0.8420
Epoch [2/20], Loss: 0.5318
Epoch [3/20], Loss: 0.3290
Epoch [4/20], Loss: 0.2354
Epoch [5/20], Loss: 0.2209
Epoch [6/20], Loss: 0.2426
Epoch [7/20], Loss: 0.2533
Epoch [8/20], Loss: 0.2441
Epoch [9/20], Loss: 0.2257
Epoch [10/20], Loss: 0.1908
Epoch [11/20], Loss: 0.1629
Epoch [12/20], Loss: 0.1327
Epoch [13/20], Loss: 0.1142
Epoch [14/20], Loss: 0.1003
Epoch [15/20], Loss: 0.0948
Epoch [16/20], Loss: 0.0919
Epoch [17/20], Loss: 0.0944
Epoch [18/20], Loss: 0.0945
Epoch [19/20], Loss: 0.0978
Epoch [20/20], Loss: 0.0969
Test Loss: 0.0557
