In [49]:
import pandas as pd

# Load the survey and travel tables for the train and test splits.
survey_train = pd.read_csv("Surveydata_train.csv")
travel_train = pd.read_csv("Traveldata_train.csv")
survey_test = pd.read_csv("Surveydata_test.csv")
travel_test = pd.read_csv("Traveldata_test.csv")

# Join the survey and travel records on the shared "ID" key. This remains an
# inner join (the pandas default), so an ID missing from either table is
# dropped. validate="one_to_one" makes pandas raise MergeError when "ID" is
# duplicated in either table instead of silently multiplying rows.
train_data = pd.merge(survey_train, travel_train, on="ID", validate="one_to_one")
test_data = pd.merge(survey_test, travel_test, on="ID", validate="one_to_one")

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
print("Train Data Info:")
train_data.info()
print("\nTest Data Info:")
test_data.info()


Train Data Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 94379 entries, 0 to 94378
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       94379 non-null  int64  
 1   Overall_Experience       94379 non-null  int64  
 2   Seat_comfort             94318 non-null  object 
 3   Seat_Class               94379 non-null  object 
 4   Arrival_time_convenient  85449 non-null  object 
 5   Catering                 85638 non-null  object 
 6   Platform_location        94349 non-null  object 
 7   Onboardwifi_service      94349 non-null  object 
 8   Onboard_entertainment    94361 non-null  object 
 9   Online_support           94288 non-null  object 
 10  Onlinebooking_Ease       94306 non-null  object 
 11  Onboard_service          86778 non-null  object 
 12  Leg_room                 94289 non-null  object 
 13  Baggage_handling         94237 non-null  object 
 14  Check

In [50]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature groups, split by the kind of preprocessing each group receives.
numeric_features = [
    'Age',
    'Travel_Distance',
    'DepartureDelay_in_Mins',
    'ArrivalDelay_in_Mins',
]
categorical_features = [
    'Seat_comfort',
    'Seat_Class',
    'Arrival_time_convenient',
    'Catering',
    'Platform_location',
    'Onboardwifi_service',
    'Onboard_entertainment',
    'Online_support',
    'Onlinebooking_Ease',
    'Onboard_service',
    'Leg_room',
    'Baggage_handling',
    'Checkin_service',
    'Cleanliness',
    'Online_boarding',
    'Gender',
    'CustomerType',
    'TypeTravel',
    'Travel_Class',
]

# Numeric columns: missing values are replaced by the column median.
numeric_transformer = SimpleImputer(strategy='median')

# Categorical columns: missing values are replaced by the most frequent
# category, then the column is one-hot encoded. handle_unknown='ignore' lets
# the encoder accept categories at transform time that it did not see at fit
# time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each feature group to its transformer. The preprocessor is only
# constructed here; this cell neither fits nor applies it.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Split the training frame into the target and the feature matrix. "ID" is
# removed from the features together with the target column.
y_train = train_data['Overall_Experience']
X_train = train_data.drop(['ID', 'Overall_Experience'], axis=1)

# Test features: every column except "ID".
X_test = test_data.drop(['ID'], axis=1)


In [51]:
from sklearn.metrics import accuracy_score, classification_report

# NOTE(review): `pipeline` is not defined in any cell visible in this notebook;
# it must already exist in the kernel as a fitted estimator. Confirm where it
# is built so the notebook survives Restart & Run All.
y_train_pred = pipeline.predict(X_train)

# Score the predictions against the training labels. These metrics are computed
# on the training rows themselves, not on held-out data.
report_train = classification_report(y_true=y_train, y_pred=y_train_pred)
accuracy_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)

# Report the training-set results.
print("Train Data Evaluation:")
print("Accuracy:", accuracy_train)
print("Classification Report:\n", report_train)



Train Data Evaluation:
Accuracy: 0.8896576568940124
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.88     42786
           1       0.90      0.90      0.90     51593

    accuracy                           0.89     94379
   macro avg       0.89      0.89      0.89     94379
weighted avg       0.89      0.89      0.89     94379



In [52]:
# NOTE(review): `pipeline` is not defined in any cell visible in this notebook;
# confirm where the fitted estimator comes from.
y_test_pred = pipeline.predict(X_test)

# Pair every test "ID" with its prediction; the "ID" column keeps the index of
# test_data and the predictions are attached positionally.
predictions_df = test_data[['ID']].assign(Predicted_Overall_Experience=y_test_pred)

# Write the ID/prediction table to disk without the DataFrame index.
predictions_df.to_csv(path_or_buf='predictions.csv', index=False)

print("Predictions saved to predictions.csv.")


Predictions saved to predictions.csv.


In [53]:
# Preview the first five rows of the predictions table as plain text.
print(predictions_df.head(n=5))


         ID  Predicted_Overall_Experience
0  99900001                             1
1  99900002                             0
2  99900003                             1
3  99900004                             0
4  99900005                             1


In [54]:
# Write the predictions table to predictions.csv again (index column omitted).
# NOTE(review): this repeats the export already performed right after
# predictions_df was created; confirm whether this cell is still needed.
predictions_df.to_csv(path_or_buf='predictions.csv', index=False)

print("Predictions saved to predictions.csv.")

Predictions saved to predictions.csv.


In [55]:
# Translate the numeric predictions (1 / 0) into their text labels.
overall_experience_mapping = {
    1: 'delighted',
    0: 'disappointed'
}

# Series.map() turns every value that is not a key of the mapping into NaN, so
# running this cell a second time (when the column already holds the text
# labels) would wipe the whole column and export NaN. Falling back to the
# current value for unmapped entries gives the same result on the first run
# and makes re-running the cell harmless.
predictions_df['Predicted_Overall_Experience'] = (
    predictions_df['Predicted_Overall_Experience']
    .map(overall_experience_mapping)
    .fillna(predictions_df['Predicted_Overall_Experience'])
)

# Overwrite predictions.csv with the labelled predictions (index omitted).
predictions_df.to_csv('predictions.csv', index=False)

print("Predictions saved to predictions.csv.")


Predictions saved to predictions.csv.
