Loading and Exploring Data:

In [9]:
import pandas as pd

# Load the dataset from the CSV file into a DataFrame.
daily_data = pd.read_csv('/daily_data.csv')

# Display basic statistics and check for missing values.
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray
# "None" line after the summary.
daily_data.info()
print(daily_data.describe())      # summary statistics of the numeric columns
print(daily_data.isnull().sum())  # number of missing entries per column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2893 entries, 0 to 2892
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   day_id                    2893 non-null   object 
 1   city_id                   2893 non-null   object 
 2   temperature_celsius       2893 non-null   float64
 3   condition_text            479 non-null    object 
 4   wind_kph                  2893 non-null   float64
 5   wind_degree               2893 non-null   int64  
 6   pressure_mb               2893 non-null   float64
 7   precip_mm                 2893 non-null   float64
 8   humidity                  2893 non-null   int64  
 9   cloud                     2893 non-null   int64  
 10  feels_like_celsius        2893 non-null   float64
 11  visibility_km             2893 non-null   float64
 12  uv_index                  2893 non-null   float64
 13  gust_kph                  2893 non-null   float64
 14  air_qual

Handling Missing Values:

In [17]:
import pandas as pd

# Read the CSV again so this cell works from the file contents on disk.
daily_data = pd.read_csv('/daily_data.csv')

# Numeric columns: every missing entry is replaced by the mean of its own column.
numeric_data = daily_data.select_dtypes(include=['number'])
daily_data[numeric_data.columns] = numeric_data.fillna(value=numeric_data.mean())

# Non-numeric columns: every missing entry is replaced by the placeholder 'Unknown'.
non_numeric_columns = daily_data.select_dtypes(exclude=['number']).columns
daily_data[non_numeric_columns] = daily_data[non_numeric_columns].fillna(value='Unknown')

# Per-column count of remaining missing values (all zeros once both steps ran).
print(daily_data.isna().sum())

day_id                      0
city_id                     0
temperature_celsius         0
condition_text              0
wind_kph                    0
wind_degree                 0
pressure_mb                 0
precip_mm                   0
humidity                    0
cloud                       0
feels_like_celsius          0
visibility_km               0
uv_index                    0
gust_kph                    0
air_quality_us-epa-index    0
sunrise                     0
sunset                      0
dtype: int64


Feature Engineering:

In [19]:
import pandas as pd

# Load the dataset
daily_data = pd.read_csv('/daily_data.csv')

# Derived sales features are only built when their input columns are present.
# 'sales_value' feeds both features, so it is the outer check.
if 'sales_value' in daily_data.columns:
    # Each feature also needs a second column. Check for it before indexing so
    # that a missing column is reported instead of raising KeyError.
    if 'sales_volume' in daily_data.columns:
        daily_data['total_sales'] = daily_data['sales_value'] + daily_data['sales_volume']
    else:
        print("Error: 'sales_volume' column not found in the DataFrame.")

    if 'customer_count' in daily_data.columns:
        daily_data['sales_per_customer'] = daily_data['sales_value'] / daily_data['customer_count']
    else:
        print("Error: 'customer_count' column not found in the DataFrame.")

    # TODO: add any further engineered features here.
else:
    print("Error: 'sales_value' column not found in the DataFrame.")

Error: 'sales_value' column not found in the DataFrame.


Model Training and Evaluation:

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder for handling categorical features

# Target: the weather condition label. Features: every other column except the
# row identifier and the sunrise/sunset columns.
y = daily_data['condition_text']
X = daily_data.drop(columns=['day_id', 'condition_text', 'sunrise', 'sunset'])

# Make every feature column numeric and free of missing values.
# This runs over all rows, before rows with a missing target are removed.
label_encoder = LabelEncoder()
for col in X.columns:
    if X[col].dtype != 'object':
        # Numeric column: fill gaps with the column mean.
        X[col] = X[col].fillna(X[col].mean())
        continue
    # Object column (treated as categorical): fill gaps with 'Unknown', then
    # map each distinct value to an integer code. The encoder is refit per column.
    X[col] = label_encoder.fit_transform(X[col].fillna('Unknown'))

# Keep only rows whose target is known. `y_notna_index` is a boolean mask
# (True where the target is present), applied identically to X and y so the
# two stay aligned.
y_notna_index = y.notna()
X = X[y_notna_index]
y = y[y_notna_index]

# Hold out 20% of the labelled rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training portion.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out portion.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7395833333333334


In [27]:
# Build the feature matrix for every row of the dataset, using the same
# column selection as the training cell.
all_features = daily_data.drop(columns=['day_id', 'condition_text', 'sunrise', 'sunset'])

# Repeat the training-time preprocessing: mean-fill numeric columns, and
# 'Unknown'-fill then integer-encode object columns (encoder refit per column).
label_encoder = LabelEncoder()
for col in all_features.columns:
    if all_features[col].dtype != 'object':
        all_features[col] = all_features[col].fillna(all_features[col].mean())
    else:
        all_features[col] = label_encoder.fit_transform(all_features[col].fillna('Unknown'))

# Predict a condition label for each row of the full dataset.
# NOTE(review): rows whose condition_text is already known also receive a model
# prediction here - confirm that this is what the submission expects.
predictions = clf.predict(all_features)

# Submission table: one row per day_id with its predicted condition_text.
submission = daily_data[['day_id']].assign(condition_text=predictions)

# Write the submission without the index column.
submission.to_csv('/submission.csv', index=False)