# Manipulating Data

In [None]:
"""
                    MANIPULATING DATA:
--> There are vectorized operations similar to NumPy arrays such as .count(), .sum(), .min(), .max(), .mean(), .std(), .prod()
    --> They can work across all elements of a DataFrame or along a specified axis.
--> There are regular arithmetic operators and NumPy universal functions such as np.abs()

                    OTHER METHODS include:
--> df.transpose() : returns a dataframe where rows become columns and columns become rows.
--> pd.to_numeric(series) : converts a series into a numerical series, i.e. to int or float.
    --> By default (errors='raise'), entries that cannot be converted result in an exception.
        --> We can override this behaviour by specifying the "errors" argument of pd.to_numeric().
            --> pd.to_numeric(series, errors='coerce') : entries that cannot be converted become NaN instead.
            
                CONCATENATING DATAFRAMES:
--> pd.concat(list_of_dataframes, axis=):
    --> axis=1 is horizontal concatenation
    --> axis=0 is vertical concatenation
    --> Pandas uses row or column index to "align" concatenated rows/columns.
        --> row/column labels missing from one of the dataframes are filled with NaN when the dataframes are concatenated. This is called an "outer join".

OUTER JOIN:
--> In an outer join, the "missing" data in the join are filled with NaN. This is the default kind of join for pd.concat().

INNER JOIN: pd.concat([df1, df2], axis=1, join='inner')
--> In an inner join, rows/columns whose labels are "missing" from any of the dataframes are dropped entirely.
"""

In [None]:
import pandas as pd
import numpy as np

In [None]:
# 4x3 sample matrix: row k holds k, 10*k and 100*k for k = 1..4.
m = np.array([[k, 10 * k, 100 * k] for k in range(1, 5)])

In [None]:
# axis=0 averages down the rows, giving one mean per column;
# axis=1 averages across the columns, giving one mean per row.
print(np.mean(m, axis=0))
print(np.mean(m, axis=1))

In [None]:
# ndarray method form of np.mean(m, axis=1): one mean per row.
m.mean(axis=1)

In [None]:
# Wrap the matrix in a DataFrame with explicit row labels r0..r3
# and column labels c0..c2, then show it.
df = pd.DataFrame(
    data=m,
    index=[f'r{i}' for i in range(4)],
    columns=[f'c{j}' for j in range(3)],
)
print(df)

In [None]:
# Column means (default axis=0) followed by row means (axis=1).
# Both are printed explicitly: a notebook cell only echoes the value of its
# last expression, so a bare `df.mean()` on the first line would be computed
# and then silently discarded.
print(df.mean())
print(df.mean(axis=1))

In [None]:
# Sum each column (the default axis=0 reduces over the rows).
df.sum()

In [None]:
# Sum each row (axis=1 reduces across the columns).
df.sum(axis=1)

In [None]:
# Element-wise addition of two columns; the result is a Series on the same row index.
df['c0'] + df['c1']

In [None]:
# Adds column 'c0' (selected by label with .loc) to the column at
# position 1 (selected by integer position with .iloc).
df.loc[:, 'c0'] + df.iloc[:, 1]

In [None]:
# NumPy universal functions accept a DataFrame and are applied element-wise.
np.sin(df)

In [None]:
# Swap rows and columns: row labels become column labels and vice versa.
df.transpose()

In [None]:
# Rebind df to the table loaded from 'world_bank_countries.csv'
# (path is relative to the working directory); this replaces the small demo frame.
df = pd.read_csv('world_bank_countries.csv')

In [None]:
# Inspect the raw census column before any conversion.
df['LatestPopulationCensus']

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics for the raw (not yet converted) census column.
df['LatestPopulationCensus'].describe()

In [None]:
# List the distinct raw values to spot entries that are not plain numbers.
print(df['LatestPopulationCensus'].unique())

In [None]:
# Attempt a strict conversion: with the default errors='raise',
# pd.to_numeric raises ValueError for an entry it cannot parse;
# the error is caught and reported instead of stopping the notebook.
try:
    pd.to_numeric(df['LatestPopulationCensus'])
except ValueError as ex:
    print('ValueError:', ex)

In [None]:
# Coerce the census column to numbers; entries that cannot be parsed
# become NaN instead of raising. The last line echoes the result.
latest_census = pd.to_numeric(df['LatestPopulationCensus'], errors='coerce')
latest_census

In [None]:
# Casting to a plain int is expected to fail while the series still
# contains NaN (NaN has no integer representation); the ValueError is reported.
try:
    latest_census.astype(int)
except ValueError as ex:
    print(f'ValueError: {ex}')

In [None]:
# 'Int32' (capital I) is pandas' nullable integer dtype, which can hold
# missing values, so this cast is expected to succeed where plain int did not.
try:
    print(latest_census.astype('Int32'))
except ValueError as ex:
    print(f'ValueError: {ex}')

In [None]:
# Drop the NaN entries first so the remaining values can be cast to a plain integer dtype.
latest_census.dropna().astype(int)

In [78]:
# Boolean-mask alternative to dropna(): keep only the non-null entries, then cast to int.
latest_census[latest_census.notnull()].astype(int)

0      1979
1      2011
2      2008
3      2010
5      2014
       ... 
241    2010
242    2007
244    2004
245    2010
246    2012
Name: LatestPopulationCensus, Length: 208, dtype: int32

In [None]:
# CONCATENATING DATAFRAMES: vertical and horizontal concatenation -- be careful of the row index.
# Two frames with identical row labels (r1, r2) and disjoint column labels.
df_1 = pd.DataFrame(
    {'c1': [1, 2], 'c2': [2, 3], 'c3': [3, 4]},
    index=['r1', 'r2'],
)

df_2 = pd.DataFrame(
    {'c10': [10, 20], 'c20': [20, 30]},
    index=['r1', 'r2'],
)

In [None]:
# Horizontal concatenation: columns are placed side by side and rows are
# matched by their index labels.
pd.concat([df_1, df_2], axis=1)

In [None]:
# Two frames whose row labels only partly overlap: 'r2' is shared,
# while 'r1' exists only in df_1 and 'r10' only in df_2.
df_1 = pd.DataFrame(
    {'c1': [1, 2], 'c2': [2, 3], 'c3': [3, 4]},
    index=['r1', 'r2'],
)

df_2 = pd.DataFrame(
    {'c10': [10, 20], 'c20': [20, 30]},
    index=['r10', 'r2'],
)

In [None]:
# Horizontal concatenation with the default outer join: the result carries the
# union of the row labels, with NaN wherever a frame has no row for a label.
print(pd.concat([df_1, df_2], axis=1))

In [None]:
# Vertical concatenation: rows are stacked and columns are aligned by name,
# with NaN in columns a frame does not have. Row labels are kept as they are,
# so a label present in both frames appears twice.
print(pd.concat([df_1, df_2], axis=0))

In [None]:
# Two frames with identical row labels that both contain a column named 'c1'.
df_1 = pd.DataFrame(
    {'c1': [1, 2], 'c2': [2, 3], 'c3': [3, 4]},
    index=['r1', 'r2'],
)

df_2 = pd.DataFrame(
    {'c1': [10, 20], 'c20': [20, 30]},
    index=['r1', 'r2'],
)

In [None]:
# Horizontal concatenation does not rename columns: if both frames have a
# column with the same name, the result contains that label twice.
pd.concat([df_1, df_2], axis=1)

In [None]:
# Rebuild the horizontally concatenated frame and assign five new column
# labels positionally, so that every column has a unique name.
data = pd.concat([df_1, df_2], axis=1)
data.columns = ['c1', 'c2', 'c3', 'c4', 'c5']
print(data.columns)

In [None]:
# Show the concatenated frame with its reassigned column labels.
print(data)

In [46]:
# Two frames with disjoint row labels (r1, r2 vs r3, r4) that share only column 'c3'.
df_1 = pd.DataFrame(
    {'c1': [1, 2], 'c2': [2, 3], 'c3': [3, 4]},
    index=['r1', 'r2'],
)

df_2 = pd.DataFrame(
    {'c3': [10, 20], 'c4': [20, 30]},
    index=['r3', 'r4'],
)

In [48]:
# Vertical concatenation: rows are stacked and columns aligned by name.
# Columns present in only one frame get NaN for the other frame's rows
# (and so show up as floats); a column present in both stays fully populated.
print(pd.concat([df_1, df_2], axis=0))

     c1   c2  c3    c4
r1  1.0  2.0   3   NaN
r2  2.0  3.0   4   NaN
r3  NaN  NaN  10  20.0
r4  NaN  NaN  20  30.0


In [62]:
# Discard the missing census years and store the rest as integers,
# rebinding latest_census to the cleaned series; the last line echoes it.
latest_census = latest_census.dropna().astype(int)
latest_census

0      1979
1      2011
2      2008
3      2010
5      2014
       ... 
241    2010
242    2007
244    2004
245    2010
246    2012
Name: LatestPopulationCensus, Length: 208, dtype: int32

In [63]:
# Keep only the two identifying columns; no rows are filtered out here.
subset = df[['CountryCode', 'ShortName']]
print(subset)

    CountryCode           ShortName
0           AFG         Afghanistan
1           ALB             Albania
2           DZA             Algeria
3           ASM      American Samoa
4           ADO             Andorra
..          ...                 ...
242         WBG  West Bank and Gaza
243         WLD               World
244         YEM               Yemen
245         ZMB              Zambia
246         ZWE            Zimbabwe

[247 rows x 2 columns]


In [65]:
# Print a copy sorted by CountryCode in descending order; each row keeps
# its original index label and subset itself is not modified.
print(subset.sort_values(by='CountryCode', ascending=False))

    CountryCode        ShortName
246         ZWE         Zimbabwe
245         ZMB           Zambia
54          ZAR  Dem. Rep. Congo
199         ZAF     South Africa
244         YEM            Yemen
..          ...              ...
1           ALB          Albania
5           AGO           Angola
0           AFG      Afghanistan
4           ADO          Andorra
10          ABW            Aruba

[247 rows x 2 columns]


In [66]:
# Default outer join on the row index: every row of subset is kept, and rows
# that have no entry in latest_census get NaN, which forces the census
# column to float64 even though latest_census itself holds integers.
pd.concat([subset, latest_census], axis=1).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247 entries, 0 to 246
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   CountryCode             247 non-null    object 
 1   ShortName               247 non-null    object 
 2   LatestPopulationCensus  208 non-null    float64
dtypes: float64(1), object(2)
memory usage: 7.7+ KB


In [72]:
# Reload the data and keep only the countries whose census year parses as a number.
df = pd.read_csv('world_bank_countries.csv')
latest_census = pd.to_numeric(df['LatestPopulationCensus'], errors='coerce')
# True where the census year was converted successfully (i.e. is not NaN).
mask = latest_census.notnull()
# Select rows and columns in a single .loc call rather than chaining two
# indexers, which would build an intermediate all-rows frame first.
subset = df.loc[mask, ['CountryCode', 'ShortName']]
print(subset)

    CountryCode           ShortName
0           AFG         Afghanistan
1           ALB             Albania
2           DZA             Algeria
3           ASM      American Samoa
5           AGO              Angola
..          ...                 ...
241         VIR      Virgin Islands
242         WBG  West Bank and Gaza
244         YEM               Yemen
245         ZMB              Zambia
246         ZWE            Zimbabwe

[208 rows x 2 columns]


In [73]:
# Integer census years with the NaN entries removed (result is only displayed, not stored).
latest_census.dropna().astype(int)

0      1979
1      2011
2      2008
3      2010
5      2014
       ... 
241    2010
242    2007
244    2004
245    2010
246    2012
Name: LatestPopulationCensus, Length: 208, dtype: int32

In [74]:
# Same selection expressed with the boolean mask instead of dropna()
# (result is only displayed, not stored).
latest_census[mask].astype(int)

0      1979
1      2011
2      2008
3      2010
5      2014
       ... 
241    2010
242    2007
244    2004
245    2010
246    2012
Name: LatestPopulationCensus, Length: 208, dtype: int32

In [77]:
# Join the filtered subset with the cleaned integer census years. Both sides
# were reduced to the rows with a valid census year, so they are expected to
# share the same index and the join introduces no NaN.
result = pd.concat([subset, latest_census.dropna().astype(int)], axis=1)
print(result)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
result.info()

    CountryCode           ShortName  LatestPopulationCensus
0           AFG         Afghanistan                    1979
1           ALB             Albania                    2011
2           DZA             Algeria                    2008
3           ASM      American Samoa                    2010
5           AGO              Angola                    2014
..          ...                 ...                     ...
241         VIR      Virgin Islands                    2010
242         WBG  West Bank and Gaza                    2007
244         YEM               Yemen                    2004
245         ZMB              Zambia                    2010
246         ZWE            Zimbabwe                    2012

[208 rows x 3 columns]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 208 entries, 0 to 246
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CountryCode             208 non-null    obj

In [80]:
# Reload the data. The census series is cleaned (coerced to numbers, NaN
# dropped, cast to int) while subset is left unfiltered, so the two no
# longer cover the same set of rows.
df = pd.read_csv('world_bank_countries.csv')
latest_census = pd.to_numeric(df['LatestPopulationCensus'], errors='coerce').dropna().astype(int)
subset = df[['CountryCode', 'ShortName']]
print(subset)

    CountryCode           ShortName
0           AFG         Afghanistan
1           ALB             Albania
2           DZA             Algeria
3           ASM      American Samoa
4           ADO             Andorra
..          ...                 ...
242         WBG  West Bank and Gaza
243         WLD               World
244         YEM               Yemen
245         ZMB              Zambia
246         ZWE            Zimbabwe

[247 rows x 2 columns]


In [81]:
# Show the cleaned series; the original row labels are preserved, so the
# index has gaps where rows were dropped.
print(latest_census)

0      1979
1      2011
2      2008
3      2010
5      2014
       ... 
241    2010
242    2007
244    2004
245    2010
246    2012
Name: LatestPopulationCensus, Length: 208, dtype: int32


In [84]:
# Outer join (pd.concat's default, spelled out here): every row of subset is
# kept and rows absent from latest_census receive NaN in the census column.
pd.concat([subset, latest_census], axis=1, join='outer').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247 entries, 0 to 246
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   CountryCode             247 non-null    object 
 1   ShortName               247 non-null    object 
 2   LatestPopulationCensus  208 non-null    float64
dtypes: float64(1), object(2)
memory usage: 7.7+ KB


In [85]:
# Inner join: only row labels present in both subset and latest_census are
# kept, so no NaN is introduced and the integer dtype is preserved.
pd.concat([subset, latest_census], axis=1, join='inner').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208 entries, 0 to 246
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CountryCode             208 non-null    object
 1   ShortName               208 non-null    object
 2   LatestPopulationCensus  208 non-null    int32 
dtypes: int32(1), object(2)
memory usage: 5.7+ KB
