In [1]:
# read and work with data

import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('cali_avocados.csv')

In [3]:
print("Number of rows: ",df.shape)
print("Value in series missing: ",df.isnull().values.any()) # determine if value in series missing
print("""How many missing values:""")
print(df.isnull().sum()) # how many missing values

Number of rows:  (420, 11)
Value in series missing:  False
How many missing values:
Year               0
 Commodity Code    0
 Crop Name         0
 County Code       0
Country            0
Harvested_Acres    0
Yield              0
Production         0
PriceP/U           0
 Unit              0
Value              0
dtype: int64


In [4]:
df = df.drop(" Crop Name", axis='columns')
df = df.drop(" Unit", axis='columns')

df.head(20)

Unnamed: 0,Year,Commodity Code,County Code,Country,Harvested_Acres,Yield,Production,PriceP/U,Value
0,2020,221999,53,Monterey,223,5.56,1240.0,2379.84,2951000
1,2020,221999,65,Riverside,3020,,,,88697000
2,2020,221999,71,San Bernardino,370,2.16,799.0,2617.02,2091000
3,2020,221998,73,San Diego,14400,3.51,50500.0,3028.87,152958000
4,2020,221999,79,San Luis Obispo,4240,5.9,25000.0,1886.76,47169000
5,2020,221999,83,Santa Barbara,5770,4.89,28200.0,2842.59,80161000
6,2020,221999,111,Ventura,16400,4.29,70300.0,2556.57,179727000
7,2019,221999,53,Monterey,225,6.58,1480.0,2500.0,3700000
8,2019,221999,65,Riverside,2940,5.48,16100.0,2505.53,40339000
9,2019,221999,71,San Bernardino,397,3.53,1400.0,2604.29,3646000


In [5]:
# transform all data to numeric, integer, float (and others)

df['Harvested_Acres'].replace('  ', np.nan, inplace=True)
df['Yield'].replace('  ', np.nan, inplace=True)
df['Production'].replace('  ', np.nan, inplace=True)
df['PriceP/U'].replace('  ', np.nan, inplace=True)

df['Year'] = pd.to_datetime(df['Year'], format='%Y')

df['Country'] = pd.factorize(df.Country)[0]

df.dropna(subset=['Harvested_Acres'], inplace=True)
df['Harvested_Acres'] = df['Harvested_Acres'].astype(int)

df.dropna(subset=['Yield'], inplace=True)
df['Yield'] = df['Yield'].astype(float)

df.dropna(subset=['Production'], inplace=True)
df['Production'] = df['Production'].astype(int)

df.dropna(subset=['PriceP/U'], inplace=True)
df['PriceP/U'] = df['PriceP/U'].astype(float)

df.dropna(subset=['Value'], inplace=True)
df['Value'] = df['Value'].astype(int)

df.dtypes

Year               datetime64[ns]
 Commodity Code             int64
 County Code                int64
Country                     int64
Harvested_Acres             int64
Yield                     float64
Production                  int64
PriceP/U                  float64
Value                       int64
dtype: object

In [6]:
df_outl = df.iloc[:, 4:9]

# Find outliers after operations with data

Q1=df_outl.quantile(0.30)
Q3=df_outl.quantile(0.70)

print(Q1)
print(Q3)

Harvested_Acres        369.800
Yield                    2.129
Production            1101.700
PriceP/U              1078.382
Value              1462400.000
Name: 0.3, dtype: float64
Harvested_Acres    7.663000e+03
Yield              3.621000e+00
Production         2.146350e+04
PriceP/U           2.004095e+03
Value              3.534841e+07
Name: 0.7, dtype: float64


In [7]:
df_outl.describe()

Unnamed: 0,Harvested_Acres,Yield,Production,PriceP/U,Value
count,394.0,394.0,394.0,394.0,394.0
mean,6461.71066,2.938452,19203.357868,1576.153553,31211840.0
std,8666.252328,1.395931,28270.407115,765.200147,45703710.0
min,31.0,0.14,29.0,163.0,9000.0
25%,238.5,1.87,826.5,877.75,1065750.0
50%,1641.5,2.9,5243.5,1618.0,8563150.0
75%,8457.25,3.85,26340.0,2112.255,42337550.0
max,36800.0,8.6,160574.0,3958.08,251452100.0


In [2]:
prec = 98/(98+143)

print(prec)

0.4066390041493776
