# Análisis Estático

## Importación de librerias y carga de datos

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('onlinefoods.csv')

## Descripción general

In [3]:
df.shape

(388, 13)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         388 non-null    int64  
 1   Gender                      388 non-null    object 
 2   Marital Status              388 non-null    object 
 3   Occupation                  388 non-null    object 
 4   Monthly Income              388 non-null    object 
 5   Educational Qualifications  388 non-null    object 
 6   Family size                 388 non-null    int64  
 7   latitude                    388 non-null    float64
 8   longitude                   388 non-null    float64
 9   Pin code                    388 non-null    int64  
 10  Output                      388 non-null    object 
 11  Feedback                    388 non-null    object 
 12  Unnamed: 12                 388 non-null    object 
dtypes: float64(2), int64(3), object(8)


In [5]:
df.head()

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,Unnamed: 12
0,20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive,Yes
1,24,Female,Single,Student,Below Rs.10000,Graduate,3,12.977,77.5773,560009,Yes,Positive,Yes
2,22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative,Yes
3,22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,560019,Yes,Positive,Yes
4,22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.985,77.5533,560010,Yes,Positive,Yes


## Tratamiendo de nulos


In [6]:
df.isna().sum()

Age                           0
Gender                        0
Marital Status                0
Occupation                    0
Monthly Income                0
Educational Qualifications    0
Family size                   0
latitude                      0
longitude                     0
Pin code                      0
Output                        0
Feedback                      0
Unnamed: 12                   0
dtype: int64

#### Observación: en caso de existir elementos nulos hacer tratamiento

## Tratamiento de duplicados

### Columnas

In [7]:
df.columns.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [8]:
df.rename(columns={df.columns[-1]: "Unnamed"}, inplace=True)

In [9]:
df["Unnamed"].equals(df["Output"])

True

In [10]:
df.drop(columns=["Unnamed"], inplace=True)
df.head()

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback
0,20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive
1,24,Female,Single,Student,Below Rs.10000,Graduate,3,12.977,77.5773,560009,Yes,Positive
2,22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative
3,22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,560019,Yes,Positive
4,22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.985,77.5533,560010,Yes,Positive


### Filas

In [11]:
duplicated = df.duplicated(keep="first")
print("Número de duplicados: ",duplicated.sum())

Número de duplicados:  103


In [12]:
duplicated_groups = df[df.duplicated()]
grouped_duplicates = duplicated_groups.groupby(list(df.columns))

for group_name, group_df in grouped_duplicates:
    print("Group:", group_name)
    print(group_df)

Group: (20, 'Male', 'Single', 'Student', 'No Income', 'Graduate', 2, 12.9261, 77.6221, 560034, 'Yes', 'Positive')
     Age Gender Marital Status Occupation Monthly Income  \
332   20   Male         Single    Student      No Income   
348   20   Male         Single    Student      No Income   

    Educational Qualifications  Family size  latitude  longitude  Pin code  \
332                   Graduate            2   12.9261    77.6221    560034   
348                   Graduate            2   12.9261    77.6221    560034   

    Output  Feedback  
332    Yes  Positive  
348    Yes  Positive  
Group: (21, 'Female', 'Single', 'Student', 'No Income', 'Post Graduate', 3, 12.9149, 77.5635, 560070, 'Yes', 'Positive')
     Age  Gender Marital Status Occupation Monthly Income  \
313   21  Female         Single    Student      No Income   

    Educational Qualifications  Family size  latitude  longitude  Pin code  \
313              Post Graduate            3   12.9149    77.5635    560070   



In [13]:
df.drop_duplicates(keep="first", inplace=True)

In [14]:
print("Numero de filas: ", df.shape[0])

Numero de filas:  285


## Indicadores

In [15]:
df.describe()

Unnamed: 0,Age,Family size,latitude,longitude,Pin code
count,285.0,285.0,285.0,285.0,285.0
mean,24.677193,3.270175,12.973429,77.597593,560037.280702
std,3.040977,1.361178,0.043964,0.053557,30.738306
min,18.0,1.0,12.8652,77.4842,560001.0
25%,23.0,2.0,12.9438,77.5635,560010.0
50%,24.0,3.0,12.977,77.5877,560028.0
75%,26.0,4.0,12.998,77.6227,560066.0
max,33.0,6.0,13.102,77.7582,560109.0


In [16]:
df.dtypes

Age                             int64
Gender                         object
Marital Status                 object
Occupation                     object
Monthly Income                 object
Educational Qualifications     object
Family size                     int64
latitude                      float64
longitude                     float64
Pin code                        int64
Output                         object
Feedback                       object
dtype: object

In [17]:

columns = df.columns.to_list()

for i in columns:
    u_values = df[i].unique()
    max_length_value = df[i].apply(lambda x: len(str(x))).max()
    min_length_value = df[i].apply(lambda x: len(str(x))).min()
    print("="*20)
    print(f"{i}")
    print("="*20)
    
    if u_values.size < 5:
        for k in u_values:
            print("- ", k)
    else:
        for k in range(5):
            print("- ", u_values[k])
        print("...")

    print(f"\nMax. longitud de valor: {max_length_value}")
    print(f"Max. longitud de valor: {min_length_value}")
    print(f"Total: {u_values.size}\n")
    
        

Age
-  20
-  24
-  22
-  27
-  23
...

Max. longitud de valor: 2
Max. longitud de valor: 2
Total: 16

Gender
-  Female
-  Male

Max. longitud de valor: 6
Max. longitud de valor: 4
Total: 2

Marital Status
-  Single
-  Married
-  Prefer not to say

Max. longitud de valor: 17
Max. longitud de valor: 6
Total: 3

Occupation
-  Student
-  Employee
-  Self Employeed
-  House wife

Max. longitud de valor: 14
Max. longitud de valor: 7
Total: 4

Monthly Income
-  No Income
-  Below Rs.10000
-  More than 50000
-  10001 to 25000
-  25001 to 50000
...

Max. longitud de valor: 15
Max. longitud de valor: 9
Total: 5

Educational Qualifications
-  Post Graduate
-  Graduate
-  Ph.D
-  Uneducated
-  School
...

Max. longitud de valor: 13
Max. longitud de valor: 4
Total: 5

Family size
-  4
-  3
-  6
-  2
-  5
...

Max. longitud de valor: 1
Max. longitud de valor: 1
Total: 6

latitude
-  12.9766
-  12.977
-  12.9551
-  12.9473
-  12.985
...

Max. longitud de valor: 7
Max. longitud de valor: 6
Total: 77



In [18]:
for i in columns:
    print(f"{"="*10}\n{df[i].value_counts()}\n")

Age
23    47
25    41
22    40
24    37
26    27
21    18
27    16
28    12
32    12
30     8
20     7
31     7
29     7
19     4
18     1
33     1
Name: count, dtype: int64

Gender
Male      164
Female    121
Name: count, dtype: int64

Marital Status
Single               189
Married               87
Prefer not to say      9
Name: count, dtype: int64

Occupation
Student           144
Employee           94
Self Employeed     38
House wife          9
Name: count, dtype: int64

Monthly Income
No Income          131
25001 to 50000      52
More than 50000     47
10001 to 25000      36
Below Rs.10000      19
Name: count, dtype: int64

Educational Qualifications
Graduate         126
Post Graduate    125
Ph.D              21
School            11
Uneducated         2
Name: count, dtype: int64

Family size
3    87
2    75
4    45
5    37
6    23
1    18
Name: count, dtype: int64

latitude
12.9770    22
12.9783    11
12.9698     8
12.9850     8
12.9343     7
           ..
13.0262     1
13.0223   

## Save changes

In [19]:
df.to_csv("clean_data.csv", index=False)