In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1. Load dataset
data = pd.read_csv('covid_data.csv')

# 2. Handle Missing Values (Fill or Drop)
data = data.fillna(data.mode().iloc[0])  # Fill NaN with mode (most frequent value)

# 3. Define Features (X) and Target (y)
X = data.drop(columns=['TotalRecovered'])  # Remove target column from features
y = data['TotalRecovered']  # Extract target variable

# 4. Convert categorical columns (if any)
# One-hot encoding for categorical variables

# 5. Split dataset (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Standardize Numeric Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 7. Train Logistic Regression Model
model = LogisticRegression(max_iter=1000)  # Increased max iterations for convergence
model.fit(X_train, y_train)

# 8. Make Predictions
y_pred = model.predict(X_test)

# 9. Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred) * 100  # Convert to percentage
print(f"Model Accuracy: {accuracy:.2f}%")

# 10. Create a DataFrame with Predictions
predictions_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred})
print(predictions_df.head())  # Show first few predictions


ValueError: could not convert string to float: 'Estonia'

In [1]:
import pandas as pd
data = pd.read_csv('covid_data.csv')
df=pd.DataFrame(data)
df

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,Country I,Continent 4,13480506,254544,3909,13417,84,83723,1881,3587,17,18882.38,995.29,4106649,304636.12,Region 4
1,Country Q,Continent 5,11595832,978049,969,45460,48,920905,3284,8214,881,84344.87,3920.37,3669112,316416.45,Region 5
2,Country L,Continent 3,25426981,924471,4577,1246,75,657172,4065,8800,532,36357.88,49.00,1340598,52723.44,Region 4
3,Country P,Continent 6,10282643,483224,2641,11575,25,882976,4254,2630,117,46994.14,1125.68,3463137,336794.44,Region 2
4,Country S,Continent 4,26902325,39136,4023,46226,12,358445,679,5902,651,1454.74,1718.29,770403,28637.04,Region 6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,Country K,Continent 3,17832443,736619,909,43211,88,996323,484,8131,152,41307.80,2423.17,3273847,183589.37,Region 3
19996,Country F,Continent 6,8628220,816959,3449,43904,8,777100,3283,9723,833,94684.54,5088.42,1842256,213515.19,Region 6
19997,Country Q,Continent 4,9130302,548109,288,39844,59,148378,552,8194,385,60031.86,4363.93,270989,29680.18,Region 1
19998,Country Z,Continent 4,27972780,741265,656,8781,67,706165,180,2415,809,26499.51,313.91,400916,14332.36,Region 6


In [3]:
df.isnull().sum()

Country/Region      0
Continent           0
Population          0
TotalCases          0
NewCases            0
TotalDeaths         0
NewDeaths           0
TotalRecovered      0
NewRecovered        0
ActiveCases         0
Serious,Critical    0
Tot Cases/1M pop    0
Deaths/1M pop       0
TotalTests          0
Tests/1M pop        0
WHO Region          0
dtype: int64

In [1]:
import dash
from dash import dcc, html, Input, Output
import plotly.express as px
import pandas as pd

# Load the data
df = pd.read_csv('worldometer_data.csv')

app = dash.Dash(__name__)

app.layout = html.Div([
    html.H1("World Map Country Data Viewer"),

    dcc.Dropdown(
        id='country-dropdown',
        options=[{'label': country, 'value': country} for country in df['Country/Region']],
        placeholder="Select a country"
    ),

    html.Button('Show Data', id='show-button', n_clicks=0),

    html.Div(id='output-container', style={'margin-top': '20px'}),

    dcc.Graph(id='world-map')
])

@app.callback(
    Output('output-container', 'children'),
    Output('world-map', 'figure'),
    Input('show-button', 'n_clicks'),
    Input('country-dropdown', 'value')
)
def update_output(n_clicks, selected_country):
    if n_clicks > 0 and selected_country:
        population = df.loc[df['Country/Region'] == selected_country, 'ActiveCases'].values[0]
        total_deaths = df.loc[df['Country/Region'] == selected_country, 'TotalDeaths'].values[0]

        data_message = f"Country: {selected_country} Active Cases: {population} Total Deaths: {total_deaths}"

        # Create the choropleth figure
        fig = px.choropleth(
            df,
            locations='Country/Region',
            locationmode='country names',
            color='ActiveCases',
            hover_name='Country/Region',
            hover_data=['TotalDeaths'],
            color_continuous_scale=px.colors.sequential.Plasma,
            title='World Map: Active Cases by Country'
        )

        # Highlight the selected country
        if selected_country in df['Country/Region'].values:
            fig.add_trace(px.choropleth(
                df[df['Country/Region'] == selected_country],
                locations='Country/Region',
                locationmode='country names',
                color='ActiveCases',
                color_continuous_scale=['lightgrey', 'red'],
                title='Selected Country'
            ).data[0])

        fig.update_layout(
            width=1000,
            height=800
        )

        # Return the data message and the figure
        return data_message, fig

    return "Select a country and press 'Show Data'", {}

if __name__ == '__main__':
    app.run_server(debug=True)

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# 1. Load dataset
data = pd.read_csv('expanded_covid_dataset.csv')

# 2. Identify categorical and numerical columns
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('TotalRecovered')  # Remove target from numerical features

# 3. Preprocessing pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Use median for numerical missing values
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Use mode for categorical
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# 4. Define features and target
X = data.drop(columns=['TotalRecovered'])
y = data['TotalRecovered']

# 5. Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Create pipeline with model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=200, random_state=42))
])

# 7. Train the model
model.fit(X_train, y_train)

# 8. Predict and evaluate
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"R-squared: {r2:.4f}")
print(f"MSE: {mse:.2f}")

# 9. Feature importance (if needed)
# rf = model.named_steps['regressor']
# feature_names = model.named_steps['preprocessor'].get_feature_names_out()
# importances = pd.Series(rf.feature_importances_, index=feature_names)
# print(importances.sort_values(ascending=False).head(10))

# 10. Create predictions DataFrame
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print("\nSample predictions:")
print(predictions_df.head().round(1))

R-squared: 0.9984
MSE: 160919433995.55

Sample predictions:
      Actual   Predicted
83   6953108   7013083.9
53  18519160  19299206.0
70  11878783  12153470.0
45  20996438  21147944.9
44  21026423  21202319.9


In [7]:
import pandas as pd
import numpy as np

# Configuration
np.random.seed(42)
num_countries = 100
continents = ["Asia", "Europe", "Africa", "North America", "South America", "Oceania"]
who_regions = ["EURO", "AMRO", "SEARO", "EMRO", "WPRO", "AFRO"]

# Generate country names using combinatorial approach
prefixes = ["New", "North", "South", "East", "West", "Central", "Upper", "Lower"]
roots = ["Ver", "Xan", "Zephyr", "Aqua", "Terra", "Solar", "Luna", "Nova"]
suffixes = ["land", "ia", "stan", "burg", "shire", "polis", "ville"]

countries = list(set([
    f"{np.random.choice(prefixes)} {np.random.choice(roots)}{np.random.choice(suffixes)}" 
    for _ in range(150)
]))[:num_countries]

# Generate base data
data = {
    "Country/Region": countries,
    "Continent": np.random.choice(continents, num_countries),
    "Population": np.random.randint(500_000, 2_000_000_000, num_countries),
    "TotalCases": np.random.randint(10_000, 50_000_000, num_countries),
    "TotalDeaths": lambda d: np.random.binomial(d.TotalCases, np.random.uniform(0.001, 0.1)),
    "TotalRecovered": lambda d: np.random.binomial(d.TotalCases, np.random.uniform(0.5, 0.95)),
    "TotalTests": lambda d: np.random.randint(d.Population//100, d.Population//10)
}

df = pd.DataFrame(data)
df = df.assign(
    TotalDeaths=data["TotalDeaths"](df),
    TotalRecovered=data["TotalRecovered"](df),
    TotalTests=data["TotalTests"](df)
)

# Derived columns
df["ActiveCases"] = df.TotalCases - df.TotalRecovered - df.TotalDeaths
df["NewCases"] = np.random.binomial(df.TotalCases, 0.001).astype(int)
df["NewDeaths"] = np.random.binomial(df.TotalDeaths, 0.005).astype(int)
df["NewRecovered"] = np.random.binomial(df.TotalRecovered, 0.01).astype(int)
df["Serious,Critical"] = (df.ActiveCases * np.random.uniform(0.01, 0.2)).astype(int)
df["Tot Cases/1M pop"] = (df.TotalCases / df.Population * 1_000_000).round(1)
df["Deaths/1M pop"] = (df.TotalDeaths / df.Population * 1_000_000).round(1)
df["Tests/1M pop"] = (df.TotalTests / df.Population * 1_000_000).round(1)
df["WHO Region"] = df.Continent.map({
    "Asia": np.random.choice(["SEARO", "WPRO", "EMRO"]),
    "Europe": "EURO",
    "Africa": "AFRO",
    "North America": "AMRO",
    "South America": "AMRO",
    "Oceania": "WPRO"
})

# Cleanup and validation
df = df[df.ActiveCases >= 0]  # Remove invalid rows
df = df.drop_duplicates(subset="Country/Region")
df = df.sort_values("TotalCases", ascending=False).reset_index(drop=True)

# Save to CSV
df.to_csv("expanded_covid_dataset.csv", index=False)
print(f"Generated {len(df)} countries\n")
print(df[["Country/Region", "Continent", "TotalCases", "TotalRecovered", "Tests/1M pop"]].head(10))

Generated 100 countries

     Country/Region      Continent  TotalCases  TotalRecovered  Tests/1M pop
0  Central Vershire  North America    49248027        35251381       77827.4
1    East Aquaville        Oceania    47921176        34299636       71336.1
2  North Solarpolis         Europe    47820594        34233225       59796.9
3    North Novaburg  North America    47695442        34141425       33797.3
4  Central Aquaburg  North America    47020676        33656962       64779.8
5   Lower Lunapolis           Asia    46987874        33640378       51878.4
6    North Xanpolis  North America    46683570        33416410       88089.0
7   North Novapolis  South America    46652343        33399500       60445.4
8    West Lunashire  North America    46557339        33326079       11655.3
9     East Novastan  North America    46343926        33176232       84798.4
