In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Load the disease/weather CSV into `df`. The absolute /content path assumes the
# file has already been uploaded there before this cell runs.
df = pd.read_csv('/content/Disease-Weather-Uganda.csv')

In [None]:
# Preview the first rows to sanity-check the column names and parsed values.
df.head()

Unnamed: 0,location,total,preasure,rain,sun,humidity,mean_temp,max_temp,min_temp,wind_gust,mean_wind_spd,month,disease,ID
0,1,9,89.55631,95.702,246.883306,0.747509,24.548443,35.6,17.2,7.35,0.757137,April,Typhoid,1
1,1,143,89.55631,95.702,246.883306,0.747509,24.548443,35.6,17.2,7.35,0.757137,April,Malaria,2
2,1,13,89.55631,95.702,246.883306,0.747509,24.548443,35.6,17.2,7.35,0.757137,April,Dysentry,3
3,1,5,89.55631,95.702,246.883306,0.747509,24.548443,35.6,17.2,7.35,0.757137,April,Asthma,4
4,1,35,89.55631,95.702,246.883306,0.747509,24.548443,35.6,17.2,7.35,0.757137,April,Skin_diseases,5


In [None]:
# Plain-text overview: descriptive statistics of the numeric columns first,
# then the number of missing values in each column.
print(
    df.describe(),
    df.isnull().sum(),
    sep="\n",
)

         location        total    preasure        rain         sun  \
count  436.000000   436.000000  436.000000  436.000000  436.000000   
mean     4.853211   132.509174   88.929529  107.682704  201.952374   
std      2.565288   214.665369    1.001354   80.952700   45.404935   
min      1.000000     1.000000   87.086055    0.000000    0.000585   
25%      3.000000    10.000000   87.943684   36.126000  178.118634   
50%      5.000000    43.000000   88.888583  105.072000  198.167742   
75%      7.000000   143.000000   89.543826  170.956000  228.722782   
max      9.000000  1346.000000   90.968042  324.431000  335.412814   

         humidity   mean_temp    max_temp    min_temp   wind_gust  \
count  436.000000  436.000000  436.000000  436.000000  436.000000   
mean     0.781459   23.007329   31.508716   16.352982   13.964656   
std      0.122605    1.871782    3.108182    1.145432    9.970180   
min      0.376106   20.561499   26.700000   11.500000    4.470000   
25%      0.727037   21.5

In [None]:
# Column roles. `location` holds small integer codes but is listed as categorical,
# so it is one-hot encoded instead of being scaled as a magnitude.
categorical_features = ['location', 'month']
numerical_features = ['preasure', 'rain', 'sun', 'humidity', 'mean_temp', 'max_temp', 'min_temp', 'wind_gust', 'mean_wind_spd']

# Target variable
target_variable = 'total'

# Preprocessing pipeline: standardize the numeric columns and one-hot encode the
# categorical ones.
# handle_unknown='ignore' makes transform() encode a category that was absent from
# the fitted data as an all-zero indicator block instead of raising ValueError,
# which can otherwise happen when a random train/test split leaves a rare
# location or month out of the training fold.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)


In [None]:
# Feature frame (numeric columns first, then categorical) and target vector.
X, y = df[numerical_features + categorical_features], df[target_variable].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessor on the training rows only, then apply the fitted
# transformation to the held-out rows.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

print("Training data shape: ", X_train.shape)
print("Test data shape: ", X_test.shape)


Training data shape:  (348, 30)
Test data shape:  (88, 30)


In [None]:
# Load the expanded dataset into `d`.
# NOTE(review): no cell above this one writes this file, so it must already
# exist on disk before the notebook is run top to bottom.
d = pd.read_csv("/content/expanded_dataset.csv")

In [None]:
# Number of fully duplicated rows in the expanded dataset.
# `sum` has to be *called*; without the parentheses the bound method object
# itself is printed rather than the count.
print(d.duplicated().sum())

<bound method Series.sum of 0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool>


In [None]:
import pandas as pd
import numpy as np

# Continuous weather measurements that receive Gaussian noise.
WEATHER_COLUMNS = ['preasure', 'rain', 'sun', 'humidity', 'mean_temp', 'max_temp', 'min_temp', 'wind_gust', 'mean_wind_spd']


def expand_dataset(data, target_rows=1000, noise_fraction=0.05, count_jitter=5, seed=0):
    """Create synthetic rows that top `data` up to `target_rows` records.

    Rows are drawn with replacement from `data` and perturbed:

    * every column in ``WEATHER_COLUMNS`` gets Gaussian noise whose standard
      deviation is ``noise_fraction`` times that column's own standard
      deviation, and is then clipped to the column's observed [min, max] so
      no out-of-range value (negative rainfall, humidity beyond the observed
      bounds, ...) can be produced;
    * ``total`` gets an integer offset drawn uniformly from
      ``[-count_jitter, +count_jitter]`` (both ends included) and is floored
      at 0;
    * ``ID`` continues the sequence after the largest existing ID;
    * all remaining columns (``location``, ``month``, ``disease``, ...) are
      copied unchanged from the sampled row, so labels stay consistent with
      the measurements they belong to and no new spellings are introduced.

    Parameters
    ----------
    data : pandas.DataFrame
        Source records; must contain ``WEATHER_COLUMNS``, ``total`` and ``ID``.
    target_rows : int
        Desired size of original + synthetic rows together.
    noise_fraction : float
        Noise standard deviation relative to each weather column's std.
    count_jitter : int
        Maximum absolute integer offset applied to ``total``.
    seed : int
        Seed for the random generator, for reproducibility.

    Returns
    -------
    pandas.DataFrame
        Only the synthetic rows, with the same columns as `data`. Empty when
        `data` already has at least `target_rows` rows.

    Raises
    ------
    ValueError
        If rows are needed but `data` is empty.
    """
    n_new = max(0, target_rows - len(data))
    if n_new == 0:
        return data.iloc[:0].copy()
    if data.empty:
        raise ValueError("cannot synthesise records from an empty dataset")

    rng = np.random.default_rng(seed)

    # Draw all template rows at once instead of sampling one row per iteration.
    picks = rng.integers(0, len(data), size=n_new)
    synthetic = data.iloc[picks].reset_index(drop=True)

    for column in WEATHER_COLUMNS:
        observed = data[column]
        spread = observed.std()
        # std() is NaN for a single-row frame; fall back to adding no noise.
        scale = noise_fraction * spread if np.isfinite(spread) else 0.0
        noisy = synthetic[column] + rng.normal(0.0, scale, size=n_new)
        synthetic[column] = noisy.clip(observed.min(), observed.max())

    # The upper bound of integers() is exclusive, hence the +1 for a symmetric range.
    jitter = rng.integers(-count_jitter, count_jitter + 1, size=n_new)
    synthetic['total'] = (synthetic['total'] + jitter).clip(lower=0)

    # Unique IDs continuing after the largest existing one (computed once).
    first_new_id = int(data['ID'].max()) + 1
    synthetic['ID'] = np.arange(first_new_id, first_new_id + n_new)

    return synthetic


# In a notebook kernel __name__ is "__main__", so this section runs as usual; the
# guard only keeps the file I/O from executing when the helper above is imported.
if __name__ == "__main__":
    # Load the existing dataset
    data = pd.read_csv('/content/Disease-Weather-Uganda.csv')

    # Number of additional records needed to reach 1000 rows (never negative)
    additional_records_needed = max(0, 1000 - len(data))

    # Generate the synthetic rows (seeded for reproducibility)
    synthetic_df = expand_dataset(data, target_rows=1000, seed=0)

    # Combine original and synthetic data
    combined_data = pd.concat([data, synthetic_df], ignore_index=True)

    # Save the combined dataset. The name matches the `expanded_dataset.csv` file
    # that the loading cell reads; the earlier `expandedd_dataset.csv` spelling was
    # a typo. NOTE(review): this is a relative path, so it only coincides with
    # /content/expanded_dataset.csv when the working directory is /content.
    combined_data.to_csv('expanded_dataset.csv', index=False)

    print(f"Dataset expanded to {len(combined_data)} records.")


Dataset expanded to 1000 records.
