In [1]:
import pandas as pd

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split


**Data Frame & Data split**

In [2]:

pd.set_option('display.max_rows', None)

# Load the data into a pandas dataframe
data = pd.read_csv("/content/drive/MyDrive/AMZ_ML/Dataset/datasetb2d9982/dataset/train.csv")

# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(data.drop("PRODUCT_LENGTH", axis=1), data["PRODUCT_LENGTH"], test_size=0.2)



In [None]:
data

1.   **Tokenization:** This involves splitting the text data into individual words,which will later be used as the features for the model.

2.   **Stopword Removal:** Stopwords are common words in the English language (e.g. 'the', 'and', 'a') that add little meaning to the text data. Removing them can improve the performance of the model.

3. **Stemming or Lemmatization:** These are techniques used to reduce the words to their base form (e.g. 'running' to 'run'). This can help reduce the number of unique features and improve the accuracy of the model.

4. **Vectorization:** Once the text data has been preprocessed, it needs to be transformed into a numerical representation that can be used by regression models. One common technique for this is using a bag-of-words model, where each word in the text is treated as a feature and its frequency in the document is used as its value.








**randomForest**

In [6]:
# Define the random forest regressor model
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

# Fit the model on the training data
rf.fit(train_data, train_labels)

# Predict the product length for the test data
predictions = rf.predict(test_data)

# Calculate the mean absolute percentage error (MAPE) of the predictions
mape = mean_absolute_percentage_error(test_labels, predictions)

# Calculate the score using the specified evaluation metric
score = max(0, 100 * (1 - mape))

# Print the score
print("Score: %.2f" % score)


ValueError: ignored

**linearRegression**

In [None]:
# Define the linear regression model
lr = LinearRegression()

# Fit the model on the training data
lr.fit(train_data, train_labels)

# Predict the product length for the test data
predictions = lr.predict(test_data)

# Calculate the mean absolute percentage error (MAPE) of the predictions
mape = mean_absolute_percentage_error(test_labels, predictions)

# Calculate the score using the specified evaluation metric
score = max(0, 100 * (1 - mape))

# Print the score
print("Score: %.2f" % score)

**decisionTree**

In [None]:
# Define the decision tree regressor model
dt = DecisionTreeRegressor(max_depth=10, random_state=42)

# Fit the model on the training data
dt.fit(train_data, train_labels)

# Predict the product length for the test data
predictions = dt.predict(test_data)

# Calculate the mean absolute percentage error (MAPE) of the predictions
mape = mean_absolute_percentage_error(test_labels, predictions)

# Calculate the score using the specified evaluation metric
score = max(0, 100 * (1 - mape))

# Print the score
print("Score: %.2f" % score)

**xgboost**

In [None]:
# Define the XGBoost regressor model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', max_depth=5, n_estimators=100)

# Fit the model on the training data
xgb_model.fit(train_data, train_labels)

# Predict the product length for the test data
predictions = xgb_model.predict(test_data)

# Calculate the mean absolute percentage error (MAPE) of the predictions
mape = mean_absolute_percentage_error(test_labels, predictions)

# Calculate the score using the specified evaluation metric
score = max(0, 100 * (1 - mape))

# Print the score
print("Score: %.2f" % score)

**adaboost**

In [None]:
# Define the AdaBoost regressor model
ada_model = AdaBoostRegressor(n_estimators=50)

# Fit the model on the training data
ada_model.fit(train_data, train_labels)

# Predict the product length for the test data
predictions = ada_model.predict(test_data)

# Calculate the mean absolute percentage error (MAPE) of the predictions
mape = mean_absolute_percentage_error(test_labels, predictions)

# Calculate the score using the specified evaluation metric
score = max(0, 100 * (1 - mape))

# Print the score
print("Score: %.2f" % score)

**supportVector**

In [None]:
# Initialize the SVR model with default hyperparameters
svr_model = SVR(kernel = "rbf")

# Train the SVR model on the training data
svr_model.fit(train_data, train_labels)

# Make predictions on the test set
predictions = svr_model.predict(test_data)

# Evaluate the model using mean absolute percentage error
score = max(0, 100*(1-mean_absolute_percentage_error(test_labels, predictions)))
print('Score:', score)

**lassoRegression**

In [None]:
# Create and train the model
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(train_data, train_labels)

# Make predictions on the test set
predictions = lasso_model.predict(test_data)

# Calculate the evaluation metric
score = max(0, 100*(1-mean_absolute_percentage_error(test_labels, predictions)))

# Print the score
print("Lasso Regression score: {:.2f}".format(score))

# with preprocessing

# # Define the column transformer to encode the categorical variables
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('onehot', OneHotEncoder(handle_unknown='ignore'), ['PRODUCT_TYPE_ID'])
#     ])

# # Fit and transform the training data
# X_train_encoded = preprocessor.fit_transform(X_train)
# X_test_encoded = preprocessor.transform(X_test)

# # Scale the numerical variables using StandardScaler
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train[['PRODUCT_LENGTH']])
# X_test_scaled = scaler.transform(X_test[['PRODUCT_LENGTH']])

# # Combine the encoded and scaled features
# X_train_final = pd.concat([pd.DataFrame(X_train_encoded.toarray()), pd.DataFrame(X_train_scaled)], axis=1)
# X_test_final = pd.concat([pd.DataFrame(X_test_encoded.toarray()), pd.DataFrame(X_test_scaled)], axis=1)

# # Create and train the model
# lasso_model = Lasso(alpha=0.1)
# lasso_model.fit(X_train_final, y_train)

# # Make predictions on the test set
# y_pred = lasso_model.predict(X_test_final)

# # Calculate the evaluation metric
# score = max(0, 100*(1-mean_absolute_percentage_error(y_test, y_pred)))

# # Print the score
# print("Lasso Regression score: {:.2f}".format(score))