# Geographic Manipulation

* Reduce the number of categories (regions) within the Comp Terr column
* Rename the column for simplification
* Check the Null Values

## Read in the current dataset 

* Check the number of categories within the dataset
* Determine the unified categories for simplification

In [2]:
import pandas as pd

In [4]:
# Load the source CSV into a DataFrame and preview its first rows
file_path = 'final_data_with_origin_and_calculate.csv'
dataset_og = pd.read_csv(filepath_or_buffer=file_path)

first_rows = dataset_og.head()
print(first_rows)


   Unnamed: 0       C1AccountNo        CXRecords Comp Terr   Latitude  Year  \
0           0  A2061733054*,CNR  91470CV(.{82 W<      MCOW   0.000000  2018   
1           1  A2061733056$=L._  91470FS$KD_> W<   Western  37.773032  2018   
2           2  A2061733057#*N-2  91470GQ%![I! W<   Central  42.135659  2018   
3           3  A2061733058)U90^  91470I8%.+ML W<   Western  33.436405  2018   
4           4  A2061733059#5)>2  91470KJ(,V-0 W<   Central   0.000000  2018   

         1         2       3    4  ...      q2      q3       q4     sales  \
0     0.00      0.00    0.00  0.0  ...    0.00    0.00     0.00      0.00   
1     0.00      0.00    0.00  0.0  ...    0.00  639.00  1438.00   4154.00   
2  1730.06  10023.32  108.18  0.0  ...    0.00    0.00  1143.74  26010.60   
3     0.00      0.00    0.00  0.0  ...  306.86    0.00   277.38   1168.48   
4     0.00      0.00    0.00  0.0  ...    0.00  153.46     0.00    306.92   

   individual  q1_calculated  q2_calculated  q3_calculated  q4

In [5]:
# Print every column label of the loaded dataset, one per line
column_labels = list(dataset_og.columns)
for column in column_labels:
    print(column)

Unnamed: 0
C1AccountNo
CXRecords
Comp Terr
Latitude
Year
1
2
3
4
5
6
7
8
9
10
11
12
q1
q2
q3
q4
sales
individual
q1_calculated
q2_calculated
q3_calculated
q4_calculated
individual_calculated


In [6]:
# Tally how many rows belong to each year of interest
years = [2018, 2019, 2020, 2021, 2022, 2023, 2024]

year_column = dataset_og['Year']
for year in years:
    # Summing the boolean mask counts the rows whose Year equals this year
    count = int((year_column == year).sum())
    print(f"Number of rows for the year {year}: {count}")

Number of rows for the year 2018: 5700
Number of rows for the year 2019: 6201
Number of rows for the year 2020: 5573
Number of rows for the year 2021: 5898
Number of rows for the year 2022: 6450
Number of rows for the year 2023: 7236
Number of rows for the year 2024: 4983


In [7]:
# Build the working frame from the raw dataset in one chain:
#   1. drop the 'Unnamed: 0' index column
#   2. rename 'Comp Terr' to the simpler 'Territory'
df = (
    dataset_og
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'Comp Terr': 'Territory'})
)

print(df.head())

        C1AccountNo        CXRecords Territory   Latitude  Year        1  \
0  A2061733054*,CNR  91470CV(.{82 W<      MCOW   0.000000  2018     0.00   
1  A2061733056$=L._  91470FS$KD_> W<   Western  37.773032  2018     0.00   
2  A2061733057#*N-2  91470GQ%![I! W<   Central  42.135659  2018  1730.06   
3  A2061733058)U90^  91470I8%.+ML W<   Western  33.436405  2018     0.00   
4  A2061733059#5)>2  91470KJ(,V-0 W<   Central   0.000000  2018     0.00   

          2       3    4       5  ...      q2      q3       q4     sales  \
0      0.00    0.00  0.0    0.00  ...    0.00    0.00     0.00      0.00   
1      0.00    0.00  0.0    0.00  ...    0.00  639.00  1438.00   4154.00   
2  10023.32  108.18  0.0    0.00  ...    0.00    0.00  1143.74  26010.60   
3      0.00    0.00  0.0  306.86  ...  306.86    0.00   277.38   1168.48   
4      0.00    0.00  0.0    0.00  ...    0.00  153.46     0.00    306.92   

   individual  q1_calculated  q2_calculated  q3_calculated  q4_calculated  \
0        

In [8]:
# Inspect the Territory column: how many distinct categories (regions) it
# holds, and what those categories are
territory_column = df['Territory']

num_categories = territory_column.nunique()
print(f"Number of unique categories in 'Territory': {num_categories}")

unique_categories = territory_column.unique()
print(f"Unique categories in 'Territory': {unique_categories}")

Number of unique categories in 'Territory': 15
Unique categories in 'Territory': ['MCOW' 'Western' 'Central' 'Midwest' 'Northeast' 'Southern' 'Minnesota'
 'INTL' 'Unknown' 'California' 'East' 'West_x000D_\n' 'MN-West'
 'Washington' 'North']


## Regroup the Categories 

* West: Includes territories such as 'Western', 'West_x000D_\n', 'California', 'Washington'.
* Midwest: Includes 'Midwest', 'Minnesota', 'Central', 'MN-West'.
* Northeast: Includes 'Northeast', 'East'.
* North: Includes 'North'.
* South: Includes 'Southern'.
* International: Includes 'INTL'.
* Unknown: Includes 'Unknown'.
* Special: Includes 'MCOW' (as we do not know what MCOW means right now)

In [9]:
# Define a mapping from each raw Territory label to its broader regional group
territory_mapping = {
    'MCOW': 'Special',
    'Western': 'West',
    # NOTE(review): '_x000D_\n' looks like a carriage-return/newline artifact
    # from a spreadsheet export — confirm against the upstream source
    'West_x000D_\n': 'West',
    'California': 'West',
    'Washington': 'West',
    'Central': 'Midwest',
    'Midwest': 'Midwest',
    'Minnesota': 'Midwest',
    'MN-West': 'Midwest',
    'Northeast': 'Northeast',
    'East': 'Northeast',
    'Southern': 'South',
    'INTL': 'International',
    'Unknown': 'Unknown',
    'North': 'North'
}

# Series.map yields NaN for any label that is not a key of the mapping, so
# build the grouped column first and fail loudly on unmapped territories
# rather than silently writing NaN groups into the dataset.
territory_group = df['Territory'].map(territory_mapping)
unmapped_labels = df.loc[territory_group.isna(), 'Territory'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Territory values missing from territory_mapping: {list(unmapped_labels)}"
    )

df['Territory_Group'] = territory_group

# Show the resulting groups and how many rows fall into each
unique_categories_group = df['Territory_Group'].unique()
num_categories_group = df['Territory_Group'].value_counts()
print(unique_categories_group)
print(num_categories_group)


['MCOW' 'Western' 'Central' 'Midwest' 'Northeast' 'Southern' 'Minnesota'
 'INTL' 'Unknown' 'California' 'East' 'West_x000D_\n' 'MN-West'
 'Washington' 'North']
Midwest          13774
South             9447
West              8538
Northeast         8378
Unknown           1080
International      804
Special             19
North                1
Name: Territory_Group, dtype: int64


In [10]:
# Show the distinct regional groups present after the regrouping
unique_categories_group = pd.unique(df['Territory_Group'])
print(unique_categories_group)

['Special' 'West' 'Midwest' 'Northeast' 'South' 'International' 'Unknown'
 'North']


## Check the number of Null Values in Territory

In [11]:
# Count the null entries in every column of the working frame
null_mask = df.isna()
missing_per_column = null_mask.sum()

print("Missing values for each column:")
print(missing_per_column)

Missing values for each column:
C1AccountNo              0
CXRecords                0
Territory                0
Latitude                 0
Year                     0
1                        0
2                        0
3                        0
4                        0
5                        0
6                        0
7                        0
8                        0
9                        0
10                       0
11                       0
12                       0
q1                       0
q2                       0
q3                       0
q4                       0
sales                    0
individual               0
q1_calculated            0
q2_calculated            0
q3_calculated            0
q4_calculated            0
individual_calculated    0
Territory_Group          0
dtype: int64


In [12]:
# Add up the per-column null counts to get the dataset-wide total
nulls_by_column = df.isna().sum()
total_missing = nulls_by_column.sum()
print(f"Total missing values in the dataset: {total_missing}")

Total missing values in the dataset: 0


In [13]:
# Check how many accounts with unknown Territory for each year
unknown_counts_per_year = df[df['Territory'] == 'Unknown'].groupby('Year').size()
print("Count of 'Unknown' categories in 'Territory' for each year:")
print(unknown_counts_per_year)

Count of 'Unknown' categories in 'Territory' for each year:
Year
2018     16
2019     11
2020      7
2021    363
2022    288
2023    225
2024    170
dtype: int64


## Export the df dataframe to a CSV file

In [14]:
# Write the cleaned frame to disk, leaving out the DataFrame index
output_file_path = 'region_cleaned_final_data.csv'
df.to_csv(path_or_buf=output_file_path, index=False)