In [18]:
# This is only needed for the tourism prediction project

In [19]:
# Tourist Count Prediction Project (Enhanced Version)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Step 1: Read the raw statistics table from the CSV next to the notebook
file_path = "India-Tourism-Statistics-2022-Table-2.1.4.csv"
df = pd.read_csv(file_path)

# Step 2: Remove rows that must not reach the model
# Every yearly arrivals column used later (2017-2021) has to be present.
required_columns = [
    'Number of Arrivals-2017',
    'Number of Arrivals-2018',
    'Number of Arrivals-2019',
    'Number of Arrivals-2020',
    'Number of Arrivals-2021',
]
# Rows labelled 'Total' or 'Others' are excluded
# (presumably aggregates rather than single nationalities - TODO confirm).
excluded_rows = df['Country of Nationality'].isin(['Total', 'Others'])
# Filter first, then drop any remaining row with a missing arrivals value.
df = df.loc[~excluded_rows].dropna(subset=required_columns)

# Step 3: Choose predictors and the value to predict
# The 2021 arrivals column is the regression target ...
target = 'Number of Arrivals-2021'
# ... and the four preceding years are the input features.
features = [
    'Number of Arrivals-2017',
    'Number of Arrivals-2018',
    'Number of Arrivals-2019',
    'Number of Arrivals-2020',
]
# X is a DataFrame (list indexer), y is a Series (single label indexer).
X, y = df[features], df[target]

# Step 4: Hold out 20% of the rows for evaluation (fixed seed => repeatable split)
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Step 5: Fit every candidate regressor and score it on the held-out split
# Candidates are keyed by display name; insertion order is the evaluation order.
models = {}
models["Linear Regression"] = LinearRegression()
models["Decision Tree"] = DecisionTreeRegressor(random_state=42)
models["Random Forest"] = RandomForestRegressor(n_estimators=100, random_state=42)

# results maps display name -> (fitted estimator, RMSE on the test split)
results = {}
for name in models:
    model = models[name]
    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    squared_error = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(squared_error)
    results[name] = (model, rmse)

# Pick the candidate with the lowest test RMSE (the first one wins on a tie).
# NOTE(review): the model is chosen on the same split whose RMSE is reported,
# so the printed figure is an optimistic estimate of out-of-sample error.
best_model_name, (best_model, best_rmse) = min(
    results.items(),
    key=lambda entry: entry[1][1],  # entry is (name, (estimator, rmse))
)

# Persist the winning estimator and the cleaned table; the original notebook
# states these files are consumed by a Flask app.
joblib.dump(best_model, "best_model.pkl")
df.to_csv("cleaned_data.csv", index=False)
print(f"Model trained and saved as best_model.pkl. Best model: {best_model_name} with RMSE: {best_rmse:.2f}")


Model trained and saved as best_model.pkl. Best model: Random Forest with RMSE: 17053.76


In [20]:
!pip install plotly pycountry

^C



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting plotly
  Using cached plotly-6.1.2-py3-none-any.whl.metadata (6.9 kB)
Collecting pycountry
  Using cached pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Using cached narwhals-1.43.0-py3-none-any.whl.metadata (11 kB)
Using cached plotly-6.1.2-py3-none-any.whl (16.3 MB)
Using cached pycountry-24.6.1-py3-none-any.whl (6.3 MB)
Using cached narwhals-1.43.0-py3-none-any.whl (362 kB)
Installing collected packages: pycountry, narwhals, plotly
Successfully installed narwhals-1.43.0 plotly-6.1.2 pycountry-24.6.1
