In [5]:
import pandas as pd

# Path to your CSV file
file_path = '/kaggle/input/cluster/API_19_DS2_en_csv_v2_6300757.csv'

# Read the CSV file, skipping the first 4 lines which are metadata

def read_csv_custom(filepath, delimiter=',', header='infer', skiprows=None):
    """
    Custom function to read a CSV file into a pandas DataFrame with additional options.

    Parameters:
    - filepath (str): The path to the CSV file to be read. This is a mandatory parameter.
    - delimiter (str, optional): Character used to separate values in the file. Defaults to ','.
    - header (int, list of int, 'infer', optional): Row number(s) to use as column names. 
      Defaults to 'infer', which means that the first line of the file is used as column names. 
      If set to None, no header will be used.
    - skiprows (list-like, int or callable, optional): Line numbers to skip (0-indexed) or number 
      of lines to skip at the start of the file. Defaults to None.

    Returns:
    - pandas.DataFrame: DataFrame containing the data from the CSV file.

    Examples:
    >>> df1 = read_csv_custom('data.csv')
    >>> df2 = read_csv_custom('data.csv', delimiter=';', header=0)
    >>> df3 = read_csv_custom('data.csv', skiprows=2)

    Notes:
    - This function is a simple wrapper around pandas.read_csv() for more customized usage.
    - For more complex file reading scenarios, refer to pandas.read_csv documentation.
    """
    return pd.read_csv(filepath, delimiter=delimiter, header=header, skiprows=skiprows)
df = read_csv_custom(file_path, skiprows=4)
# Display the first few rows of the DataFrame
print(df.head())

   Aruba  ABW                                  Population, total  \
0  Aruba  ABW                       Population growth (annual %)   
1  Aruba  ABW  Poverty headcount ratio at $2.15 a day (2017 P...   
2  Aruba  ABW  Prevalence of underweight, weight for age (% o...   
3  Aruba  ABW        Community health workers (per 1,000 people)   
4  Aruba  ABW    Mortality rate, under-5 (per 1,000 live births)   

      SP.POP.TOTL  54608     55811     56682     57475     58178     58782  \
0     SP.POP.GROW    NaN  2.179059  1.548572  1.389337  1.215721  1.032841   
1     SI.POV.DDAY    NaN       NaN       NaN       NaN       NaN       NaN   
2  SH.STA.MALN.ZS    NaN       NaN       NaN       NaN       NaN       NaN   
3  SH.MED.CMHW.P3    NaN       NaN       NaN       NaN       NaN       NaN   
4     SH.DYN.MORT    NaN       NaN       NaN       NaN       NaN       NaN   

   ...    102880    103594    104257    104874    105439    105962   106442  \
0  ...  0.749301  0.691615  0.637959  0.590

In [6]:
import csv

def read_csv_custom(filepath, delimiter=',', skip_header=False):
    """
    Custom function to read a CSV file into a list of dictionaries without using Pandas.

    Parameters:
    - filepath (str): The path to the CSV file to be read.
    - delimiter (str, optional): Character used to separate values in the file. Defaults to ','.
    - skip_header (bool, optional): Whether to skip the first row (header) of the CSV file. 
      Defaults to False.

    Returns:
    - list of dict: A list where each element is a dictionary representing a row in the CSV file. 
      Column headers are used as keys.

    Example:
    >>> data = read_csv_custom('data.csv')
    >>> data = read_csv_custom('data.csv', delimiter=';', skip_header=True)

    Note:
    - This function does not handle complex CSV features like multi-line fields.
    """
    data = []
    with open(filepath, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=delimiter)
        if skip_header:
            next(reader, None)  # Skip the header row
        for row in reader:
            data.append(row)
    return data

# Example usage of the function
data = read_csv_custom(file_path)
# df = pd.DataFrame(data)
# print(df.head())


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20212 entries, 0 to 20211
Data columns (total 67 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Aruba              20212 non-null  object 
 1   ABW                20212 non-null  object 
 2   Population, total  20212 non-null  object 
 3   SP.POP.TOTL        20212 non-null  object 
 4   54608              2490 non-null   float64
 5   55811              4069 non-null   float64
 6   56682              4103 non-null   float64
 7   57475              4120 non-null   float64
 8   58178              4161 non-null   float64
 9   58782              4190 non-null   float64
 10  59291              4196 non-null   float64
 11  59522              4201 non-null   float64
 12  59471              4212 non-null   float64
 13  59330              4217 non-null   float64
 14  59106              4757 non-null   float64
 15  58816              6097 non-null   float64
 16  58855              611

In [9]:
data= df.dropna()
print("The total number of data-points after removing the rows with missing values are:", len(data))

The total number of data-points after removing the rows with missing values are: 995


In [10]:
data.describe()

Unnamed: 0,54608,55811,56682,57475,58178,58782,59291,59522,59471,59330,...,102880,103594,104257,104874,105439,105962,106442,106585,106537,106445
count,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,...,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0,995.0
mean,40487320.0,41196090.0,42088940.0,43150650.0,44228430.0,45264670.0,46339610.0,47421820.0,48534120.0,49683460.0,...,117390500.0,119323000.0,121258600.0,123199000.0,125147000.0,127075400.0,128974400.0,130825700.0,132539000.0,134129800.0
std,204333500.0,207124000.0,211072200.0,216077200.0,221123600.0,226138500.0,231399600.0,236652500.0,242064600.0,247689100.0,...,543420000.0,551297800.0,559134700.0,566956700.0,574807900.0,582506800.0,590023500.0,597262500.0,603902400.0,609860900.0
min,0.2921227,0.3150861,0.3397334,0.366188,0.3947513,0.4254729,0.4586037,0.5152986,0.5819781,0.6571884,...,0.03437522,0.03454085,0.03261351,0.03158219,0.03077838,0.03013746,0.03219431,0.03289014,0.0314737,0.02810989
25%,24.32069,25.51793,25.33068,25.63816,25.83755,26.20884,26.46,26.803,27.1672,27.57101,...,38.51158,39.18462,39.75837,40.13324,40.51654,40.844,41.4577,41.63499,42.04863,42.50646
50%,9446.0,10050.0,10532.0,11004.0,11465.0,11952.0,12452.0,12967.0,13597.0,14600.0,...,36806.0,37685.0,38503.0,38398.0,39844.0,40895.0,41239.0,41445.0,42050.0,41569.0
75%,2762589.0,2819761.0,2891214.0,2981509.0,3060961.0,3130878.0,3189342.0,3257154.0,3356073.0,3402207.0,...,8112978.0,8362674.0,8546595.0,8731039.0,8962744.0,9134150.0,9274330.0,9349608.0,9540646.0,9600274.0
max,3031474000.0,3072422000.0,3126850000.0,3193429000.0,3260442000.0,3328209000.0,3398480000.0,3468371000.0,3540164000.0,3614573000.0,...,7229303000.0,7317040000.0,7403850000.0,7490415000.0,7576442000.0,7660371000.0,7741775000.0,7820206000.0,7888306000.0,7950947000.0


In [17]:
data

Unnamed: 0,Aruba,ABW,"Population, total",SP.POP.TOTL,54608,55811,56682,57475,58178,58782,...,102880,103594,104257,104874,105439,105962,106442,106585,106537,106445
72,Africa Eastern and Southern,AFE,Urban population (% of total population),SP.URB.TOTL.IN.ZS,1.456381e+01,1.481141e+01,1.506925e+01,1.534798e+01,1.564019e+01,1.594128e+01,...,3.347621e+01,3.393696e+01,3.442558e+01,3.489475e+01,3.535890e+01,3.584760e+01,3.633626e+01,3.682830e+01,3.732370e+01,3.782516e+01
73,Africa Eastern and Southern,AFE,Urban population,SP.URB.TOTL,1.903382e+07,1.987235e+07,2.077079e+07,2.173742e+07,2.277306e+07,2.387085e+07,...,1.901088e+08,1.980735e+08,2.065564e+08,2.150834e+08,2.237322e+08,2.329223e+08,2.424511e+08,2.523155e+08,2.623771e+08,2.726661e+08
75,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL,1.306926e+08,1.341692e+08,1.378356e+08,1.416305e+08,1.456060e+08,1.497424e+08,...,5.678921e+08,5.836511e+08,6.000084e+08,6.163776e+08,6.327466e+08,6.497571e+08,6.672430e+08,6.851130e+08,7.029771e+08,7.208591e+08
91,Africa Eastern and Southern,AFE,Population in urban agglomerations of more tha...,EN.URB.MCTY.TL.ZS,6.444405e+00,6.564937e+00,6.690929e+00,6.815871e+00,6.947877e+00,7.085666e+00,...,1.365722e+01,1.381416e+01,1.393429e+01,1.406590e+01,1.420788e+01,1.433700e+01,1.445516e+01,1.458105e+01,1.472133e+01,1.487672e+01
148,Afghanistan,AFG,Urban population (% of total population),SP.URB.TOTL.IN.ZS,8.401000e+00,8.684000e+00,8.976000e+00,9.276000e+00,9.586000e+00,9.904000e+00,...,2.437300e+01,2.458700e+01,2.480300e+01,2.502000e+01,2.525000e+01,2.549500e+01,2.575400e+01,2.602600e+01,2.631400e+01,2.661600e+01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20079,Zambia,ZMB,Population in urban agglomerations of more tha...,EN.URB.MCTY.TL.ZS,2.915340e+00,3.143517e+00,3.389273e+00,3.661470e+00,3.998799e+00,4.365472e+00,...,1.304941e+01,1.324996e+01,1.346104e+01,1.368156e+01,1.391039e+01,1.415037e+01,1.439908e+01,1.465646e+01,1.492310e+01,1.519552e+01
20136,Zimbabwe,ZWE,Urban population (% of total population),SP.URB.TOTL.IN.ZS,1.260800e+01,1.282100e+01,1.308200e+01,1.357800e+01,1.409200e+01,1.462000e+01,...,3.265400e+01,3.250400e+01,3.238500e+01,3.229600e+01,3.223700e+01,3.220900e+01,3.221000e+01,3.224200e+01,3.230300e+01,3.239500e+01
20137,Zimbabwe,ZWE,Urban population,SP.URB.TOTL,4.799000e+05,5.033460e+05,5.297920e+05,5.672790e+05,6.074120e+05,6.501730e+05,...,4.426387e+06,4.503674e+06,4.584076e+06,4.667645e+06,4.755312e+06,4.848158e+06,4.945719e+06,5.052214e+06,5.166388e+06,5.287038e+06
20139,Zimbabwe,ZWE,"Population, total",SP.POP.TOTL,3.806310e+06,3.925952e+06,4.049778e+06,4.177931e+06,4.310332e+06,4.447149e+06,...,1.355542e+07,1.385575e+07,1.415494e+07,1.445270e+07,1.475110e+07,1.505218e+07,1.535461e+07,1.566967e+07,1.599352e+07,1.632054e+07


In [18]:
import csv
import pandas as pd

def transpose_csv(df):
    """
    Reads a CSV file into a DataFrame, transposes it, and returns the transposed DataFrame.

    Parameters:
    - dataframe

    Returns:
    - pandas.DataFrame: Transposed DataFrame.

    Example:
    >>> transposed_df = transpose_csv(df)
    """


    transposed_df = df.T  # or df.transpose()

    return transposed_df

# Example usage
transposed_df = transpose_csv(data)


In [20]:
transposed_df

Unnamed: 0,72,73,75,91,148,149,151,167,224,225,...,20003,20060,20061,20063,20071,20079,20136,20137,20139,20155
Aruba,Africa Eastern and Southern,Africa Eastern and Southern,Africa Eastern and Southern,Africa Eastern and Southern,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Africa Western and Central,Africa Western and Central,...,South Africa,Zambia,Zambia,Zambia,Zambia,Zambia,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe
ABW,AFE,AFE,AFE,AFE,AFG,AFG,AFG,AFG,AFW,AFW,...,ZAF,ZMB,ZMB,ZMB,ZMB,ZMB,ZWE,ZWE,ZWE,ZWE
"Population, total",Urban population (% of total population),Urban population,"Population, total",Population in urban agglomerations of more tha...,Urban population (% of total population),Urban population,"Population, total",Population in urban agglomerations of more tha...,Urban population (% of total population),Urban population,...,Population in urban agglomerations of more tha...,Urban population (% of total population),Urban population,"Population, total","Agriculture, forestry, and fishing, value adde...",Population in urban agglomerations of more tha...,Urban population (% of total population),Urban population,"Population, total",Population in urban agglomerations of more tha...
SP.POP.TOTL,SP.URB.TOTL.IN.ZS,SP.URB.TOTL,SP.POP.TOTL,EN.URB.MCTY.TL.ZS,SP.URB.TOTL.IN.ZS,SP.URB.TOTL,SP.POP.TOTL,EN.URB.MCTY.TL.ZS,SP.URB.TOTL.IN.ZS,SP.URB.TOTL,...,EN.URB.MCTY.TL.ZS,SP.URB.TOTL.IN.ZS,SP.URB.TOTL,SP.POP.TOTL,NV.AGR.TOTL.ZS,EN.URB.MCTY.TL.ZS,SP.URB.TOTL.IN.ZS,SP.URB.TOTL,SP.POP.TOTL,EN.URB.MCTY.TL.ZS
54608,14.56381,19033821.0,130692579.0,6.444405,8.401,724373.0,8622466.0,3.309401,14.705391,14301917.0,...,24.323394,18.145,566021.0,3119430.0,11.460629,2.91534,12.608,479900.0,3806310.0,6.514919
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105962,35.847598,232922327.0,649757148.0,14.337003,25.495,9353296.0,36686784.0,10.935191,46.649426,206492202.0,...,35.578753,43.521,7762359.0,17835893.0,3.341124,14.150365,32.209,4848158.0,15052184.0,10.065091
106442,36.336259,242451140.0,667242986.0,14.455163,25.754,9727157.0,37769499.0,10.892466,47.255413,214684207.0,...,35.870818,44.072,8100644.0,18380477.0,2.860775,14.399077,32.21,4945719.0,15354608.0,9.907847
106585,36.828302,252315481.0,685112979.0,14.581055,26.026,10142913.0,38972230.0,10.832154,47.857831,223107995.0,...,36.176168,44.629,8447250.0,18927715.0,2.977086,14.65646,32.242,5052214.0,15669666.0,9.763578
106537,37.323699,262377061.0,702977106.0,14.721332,26.314,10551772.0,40099462.0,10.812539,48.454806,231704052.0,...,36.544603,45.192,8800295.0,19473125.0,3.000918,14.923095,32.303,5166388.0,15993524.0,9.640302


In [None]:
import pandas as pd

def read_clean_and_transpose_df(data):
    """
    Converts a list of dictionaries or a list of lists to a DataFrame, 
    performs basic cleaning tasks, and transposes it.

    The cleaning tasks include dropping rows with missing (NA) values.

    Parameters:
    - data (list of dict or list of lists): The data to be converted to a DataFrame, cleaned, and transposed.

    Returns:
    - pandas.DataFrame: Cleaned and transposed DataFrame.

    Example:
    >>> data = [['A', 'B', 'C'], [1, 2, 3], [4, 5, None]]
    >>> cleaned_transposed_df = read_clean_and_transpose_df(data)
    """

    # Convert data to DataFrame
    if isinstance(data[0], dict):
        # If data is a list of dictionaries
        df = pd.DataFrame(data)
    else:
        # If data is a list of lists
        df = pd.DataFrame(data[1:], columns=data[0])

    # Drop rows with missing values
    df_cleaned = df.dropna()
    

    # Transpose the DataFrame
    transposed_df = df_cleaned.T

    return transposed_df

# Example usage
# Assuming 'data' is your list
cleaned_transposed_df = read_clean_and_transpose_df(df)


In [19]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Assuming X is your data
X=transposed_df
# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Using the Elbow Method to find the optimal number of clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Plotting the results
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Choose the number of clusters (k) and fit the model
k = 3  # Example number, replace with the chosen k
kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

# Calculating silhouette score
silhouette_avg = silhouette_score(X_scaled, y_kmeans)
print(f"The average silhouette score for {k} clusters is: {silhouette_avg}")

# Additional analysis and interpretation...


ValueError: could not convert string to float: 'Africa Eastern and Southern'