In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
df=pd.read_csv('tortilla_prices.csv')

In [4]:
df

Unnamed: 0,State,City,Year,Month,Day,Store type,Price per kilogram
0,Aguascalientes,Aguascalientes,2007,1,10,Mom and Pop Store,9.90
1,Baja California,Mexicali,2007,1,10,Mom and Pop Store,
2,Baja California,Tijuana,2007,1,10,Mom and Pop Store,10.00
3,Baja California Sur,La Paz,2007,1,10,Mom and Pop Store,10.00
4,Campeche,Campeche,2007,1,10,Mom and Pop Store,10.00
...,...,...,...,...,...,...,...
278881,Veracruz,Coatzacoalcos,2024,3,1,Big Retail Store,12.50
278882,Veracruz,Veracruz,2024,3,1,Big Retail Store,13.57
278883,Veracruz,Xalapa,2024,3,1,Big Retail Store,13.97
278884,Yucatán,Mérida,2024,3,1,Big Retail Store,13.20


In [5]:
#missing values
missing_percentages = df.isna().sum().sort_values(ascending=False)
missing_percentages

Price per kilogram    6390
State                    0
City                     0
Year                     0
Month                    0
Day                      0
Store type               0
dtype: int64

In [6]:
df.dropna(inplace=True)

In [7]:
#missing values
missing_percentages = df.isna().sum().sort_values(ascending=False)
missing_percentages

State                 0
City                  0
Year                  0
Month                 0
Day                   0
Store type            0
Price per kilogram    0
dtype: int64

In [8]:
df["City"].unique()

array(['Aguascalientes', 'Tijuana', 'La\xa0Paz', 'Campeche',
       'Piedras\xa0Negras', 'Colima', 'Tuxtla\xa0Gutiérrez', 'Chihuahua',
       'D.F.', 'ZM\xa0D.F.', 'Durango', 'León', 'Acapulco', 'Pachuca',
       'Guadalajara', 'Toluca', 'Morelia', 'Cuernavaca', 'Tepic',
       'Monterrey', 'Oaxaca', 'Puebla', 'Querétaro', 'Cancún', 'Chetumal',
       'San\xa0Luis\xa0Potosí', 'Culiacán', 'Cd.\xa0Obregón',
       'Hermosillo', 'Villahermosa', 'Cd.\xa0Victoria', 'Tlaxcala',
       'Coatzacoalcos', 'Poza\xa0Rica', 'Veracruz', 'Xalapa', 'Mérida',
       'Zacatecas', 'Matamoros', 'Nuevo\xa0Laredo', 'Reynosa', 'Mexicali',
       'Saltillo', 'Tapachula', 'Cd.\xa0Juárez', 'Gómez\xa0Palacio',
       'Irapuato', 'Chilpancingo', 'ZM\xa0Guadalajara', 'ZM\xa0Monterrey',
       'ZM\xa0Puebla', 'Nogales', 'Tampico', 'Torreón', 'Celaya',
       'San\xa0Luis\xa0Río\xa0Colorado'], dtype=object)

In [9]:
df["Store type"].unique()

array(['Mom and Pop Store', 'Big Retail Store'], dtype=object)

In [10]:
df["State"].unique()

array(['Aguascalientes', 'Baja\xa0California',
       'Baja\xa0California\xa0Sur', 'Campeche', 'Coahuila', 'Colima',
       'Chiapas', 'Chihuahua', 'D.F.', 'Durango', 'Guanajuato',
       'Guerrero', 'Hidalgo', 'Jalisco', 'Edo.\xa0México', 'Michoacán',
       'Morelos', 'Nayarit', 'Nuevo\xa0León', 'Oaxaca', 'Puebla',
       'Querétaro', 'Quintana\xa0Roo', 'San\xa0Luis\xa0Potosí', 'Sinaloa',
       'Sonora', 'Tabasco', 'Tamaulipas', 'Tlaxcala', 'Veracruz',
       'Yucatán', 'Zacatecas'], dtype=object)

In [11]:
label_encoder = LabelEncoder()
df['City_encoded'] = label_encoder.fit_transform(df['City'])
df['State_encoded'] = label_encoder.fit_transform(df['State'])
df['Store type_encoded'] = label_encoder.fit_transform(df['Store type'])


In [12]:
df = df.drop('City', axis=1)
df = df.drop('State', axis=1)
df = df.drop('Store type', axis=1)

In [13]:
df

Unnamed: 0,Year,Month,Day,Price per kilogram,City_encoded,State_encoded,Store type_encoded
0,2007,1,10,9.90,1,0,1
2,2007,1,10,10.00,43,1,1
3,2007,1,10,10.00,21,2,1
4,2007,1,10,10.00,2,3,1
5,2007,1,10,10.00,32,6,1
...,...,...,...,...,...,...,...
278881,2024,3,1,12.50,11,29,0
278882,2024,3,1,13.57,48,29,0
278883,2024,3,1,13.97,50,29,0
278884,2024,3,1,13.20,27,30,0


In [14]:
target = df[['Price per kilogram']].copy()
target

Unnamed: 0,Price per kilogram
0,9.90
2,10.00
3,10.00
4,10.00
5,10.00
...,...
278881,12.50
278882,13.57
278883,13.97
278884,13.20


In [15]:
df = df.drop('Price per kilogram', axis=1)
df

Unnamed: 0,Year,Month,Day,City_encoded,State_encoded,Store type_encoded
0,2007,1,10,1,0,1
2,2007,1,10,43,1,1
3,2007,1,10,21,2,1
4,2007,1,10,2,3,1
5,2007,1,10,32,6,1
...,...,...,...,...,...,...
278881,2024,3,1,11,29,0
278882,2024,3,1,48,29,0
278883,2024,3,1,50,29,0
278884,2024,3,1,27,30,0


In [16]:
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=42)

In [17]:
# Display the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (217996, 6)
X_test shape: (54500, 6)
y_train shape: (217996, 1)
y_test shape: (54500, 1)


In [19]:
# Saving
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

In [24]:
from sklearn.linear_model import LogisticRegression


model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

  y = column_or_1d(y, warn=True)


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [23]:
    # Calculate accuracy
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)
    
    # Calculate and print other metrics
    print("Classification Report:")
    print(classification_report(y_test, predictions))

ValueError: continuous is not supported