In [1]:
import pandas as pd

In [2]:
# Load the dataset
df = pd.read_csv("C:/Users/MS Mthethwa/Desktop/Nana/City Development Index.csv.xls")

In [3]:
# Display the first few rows of the dataset
print("\nFirst 5 rows of the dataset:")
print(df.head())


First 5 rows of the dataset:
   No.          City                   Country  Region   CDI  City Product  \
0    1       Abidjan             C?te d'Ivoire  Africa  39.7          56.6   
1    2         Accra                     Ghana  Africa  46.6          49.4   
2    3  Antananarivo                Madagascar  Africa  34.5          44.4   
3    4        Bangui  Central African Republic  Africa  27.5          42.0   
4    5        Banjul                    Gambia  Africa  40.5          46.0   

   Infrastructure  Waste  Health  Education  Unnamed: 10  Unnamed: 11  \
0            21.7   29.0    94.6       42.4          NaN          NaN   
1            50.0    0.0    94.0       62.0          NaN          NaN   
2            22.5    0.0    92.7       52.5          NaN          NaN   
3            14.9    0.1    90.2       36.8          NaN          NaN   
4            16.0   48.0    87.8       37.8          NaN          NaN   

   Unnamed: 12  Unnamed: 13  Unnamed: 14  
0          NaN     

In [4]:
# Display of column names
print("Column names:")
print(df.columns)

Column names:
Index(['No.', 'City', 'Country', 'Region', 'CDI', 'City Product',
       'Infrastructure', 'Waste', 'Health', 'Education', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14'],
      dtype='object')


In [5]:
# size of the dataset
print("Dataset size:")
print(df.shape)

Dataset size:
(161, 15)


In [6]:
# Basic information about the dataset
print("Dataset Info:")
print(df.info())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   No.             161 non-null    int64  
 1   City            161 non-null    object 
 2   Country         161 non-null    object 
 3   Region          161 non-null    object 
 4   CDI             161 non-null    float64
 5   City Product    161 non-null    float64
 6   Infrastructure  161 non-null    float64
 7   Waste           161 non-null    float64
 8   Health          161 non-null    float64
 9   Education       161 non-null    float64
 10  Unnamed: 10     0 non-null      float64
 11  Unnamed: 11     0 non-null      float64
 12  Unnamed: 12     0 non-null      float64
 13  Unnamed: 13     0 non-null      float64
 14  Unnamed: 14     0 non-null      float64
dtypes: float64(11), int64(1), object(3)
memory usage: 19.0+ KB
None


In [7]:
# Check for missing values
print("\nMissing values in every column:")
print(df.isnull().sum())


Missing values in every column:
No.                 0
City                0
Country             0
Region              0
CDI                 0
City Product        0
Infrastructure      0
Waste               0
Health              0
Education           0
Unnamed: 10       161
Unnamed: 11       161
Unnamed: 12       161
Unnamed: 13       161
Unnamed: 14       161
dtype: int64


In [8]:
# Rows with missing values
print("\nRows with missing values:")
print(df[df.isnull().any(axis=1)])


Rows with missing values:
     No.             City                   Country        Region   CDI  \
0      1          Abidjan             C?te d'Ivoire        Africa  39.7   
1      2            Accra                     Ghana        Africa  46.6   
2      3     Antananarivo                Madagascar        Africa  34.5   
3      4           Bangui  Central African Republic        Africa  27.5   
4      5           Banjul                    Gambia        Africa  40.5   
..   ...              ...                       ...           ...   ...   
156  158          Tbilisi                   Georgia  Transitional  72.2   
157  159           Troyan                  Bulgaria  Transitional  64.8   
158  160   Veliko Tarnovo                  Bulgaria  Transitional  71.5   
159  161  Veliky Novgorod        Russian Federation  Transitional  76.2   
160  162          Vilnius                 Lithuania  Transitional  83.3   

     City Product  Infrastructure  Waste  Health  Education  Unnamed: 10

In [9]:
import numpy as np

In [10]:
# Fill missing numerical values with the mean of the column
numerical_columns = df.select_dtypes(include=[np.number]).columns
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

# Fill missing categorical values with the mode of the column
categorical_columns = df.select_dtypes(include=[object]).columns
df[categorical_columns] = df[categorical_columns].fillna(df[categorical_columns].mode().iloc[0])

# Verify if there are any missing values left
print("\nMissing values after handling:")
print(df.isnull().sum())


Missing values after handling:
No.                 0
City                0
Country             0
Region              0
CDI                 0
City Product        0
Infrastructure      0
Waste               0
Health              0
Education           0
Unnamed: 10       161
Unnamed: 11       161
Unnamed: 12       161
Unnamed: 13       161
Unnamed: 14       161
dtype: int64


In [11]:
from sklearn.preprocessing import StandardScaler

# Identify numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Fill missing values with mean (or any other appropriate method)
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# Standardize the numerical columns
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Display the first few rows to verify changes
print("\nFirst 5 rows after standardization:")
print(df.head())



First 5 rows after standardization:
        No.          City                   Country  Region       CDI  \
0 -1.723859       Abidjan             C?te d'Ivoire  Africa -1.404385   
1 -1.702503         Accra                     Ghana  Africa -1.021756   
2 -1.681147  Antananarivo                Madagascar  Africa -1.692744   
3 -1.659791        Bangui  Central African Republic  Africa -2.080918   
4 -1.638436        Banjul                    Gambia  Africa -1.360023   

   City Product  Infrastructure     Waste    Health  Education  Unnamed: 10  \
0     -0.486458       -1.933768 -0.468143  1.179385  -1.516117          NaN   
1     -0.970204       -0.759414 -1.384951  1.138800  -0.547742          NaN   
2     -1.306138       -1.900571 -1.384951  1.050863  -1.017107          NaN   
3     -1.467387       -2.215945 -1.381789  0.881756  -1.792796          NaN   
4     -1.198639       -2.170299  0.132524  0.719412  -1.743389          NaN   

   Unnamed: 11  Unnamed: 12  Unnamed: 13  Unnamed

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [12]:
# Display summary statistics
summary_stats = df.describe()
print(summary_stats)

                No.           CDI  City Product  Infrastructure       Waste  \
count  1.610000e+02  1.610000e+02  1.610000e+02    1.610000e+02  161.000000   
mean   8.826618e-17 -9.267949e-16  3.971978e-16    2.647985e-16    0.000000   
std    1.003120e+00  1.003120e+00  1.003120e+00    1.003120e+00    1.003120   
min   -1.723859e+00 -2.402548e+00 -2.226599e+00   -2.639211e+00   -1.384951   
25%   -8.696230e-01 -6.446726e-01 -8.223927e-01   -7.594139e-01   -0.879126   
50%    5.969041e-03  1.704936e-01  3.088093e-02    2.697023e-01    0.037682   
75%    8.602052e-01  6.862111e-01  6.960312e-01    8.423556e-01    0.669963   
max    1.714441e+00  1.828553e+00  2.241330e+00    1.307118e+00    1.776454   

             Health     Education  Unnamed: 10  Unnamed: 11  Unnamed: 12  \
count  1.610000e+02  1.610000e+02          0.0          0.0          0.0   
mean   3.199649e-16  7.943956e-16          NaN          NaN          NaN   
std    1.003120e+00  1.003120e+00          NaN          NaN 

# data types that are used in relation to machine learning
-**No. (Number)**:This column likely represents a unique identifier for each entry in the dataset.
Data Type: Integer
City:
Represents the name of a city.
Data Type: String
Country:
Indicates the country to which the city belongs.
Data Type: String
-**Region**: Specifies the geographical region or continent where the city is located.
Data Type: String
-**CDI (City Development Index)**:Represents a numerical measure of a city's development level.
Data Type: Float or Integer
-**City Product**:Likely refers to the economic output or gross domestic product (GDP) of the city.
Data Type: Float or Integer
-**Infrastructure**:Represents the quality or level of infrastructure in the city (e.g., transportation, utilities).
Data Type: Float or Integer
-**Waste**:Indicates metrics related to waste management or environmental sustainability in the city.
Data Type: Float or Integer
-**Health**:Represents indicators of public health such as healthcare facilities, life expectancy, etc.
Data Type: Float or Integer
-**Education**:Indicates metrics related to education such as literacy rate, school enrollment, etc.
Data Type: Float or Integer
-**Unnamed: 10, Unnamed: 11, Unnamed: 12, Unnamed: 13, Unnamed: 14**: These columns seem to be unnamed and may contain additional information or could be empty columns.
Data Type: Depends on the content, could be Float, Integer, or String


In summary, machine learning and statistical datasets often contain a variety of data types including integers, floats, and strings. Understanding the data types of each column is crucial for data preprocessing, feature engineering, and model building in machine learning tasks.