In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [6]:
url="https://raw.githubusercontent.com/dsrscientist/dataset3/main/glass.csv"
data=pd.read_csv(url)

In [7]:
data

Unnamed: 0,1,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00.1,1.1
0,2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,1
1,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,1
2,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,1
3,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,1
4,6,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,1
...,...,...,...,...,...,...,...,...,...,...,...
208,210,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.00,7
209,211,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.00,7
210,212,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.00,7
211,213,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.00,7


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213 entries, 0 to 212
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   1        213 non-null    int64  
 1   1.52101  213 non-null    float64
 2   13.64    213 non-null    float64
 3   4.49     213 non-null    float64
 4   1.10     213 non-null    float64
 5   71.78    213 non-null    float64
 6   0.06     213 non-null    float64
 7   8.75     213 non-null    float64
 8   0.00     213 non-null    float64
 9   0.00.1   213 non-null    float64
 10  1.1      213 non-null    int64  
dtypes: float64(9), int64(2)
memory usage: 18.4 KB


In [12]:
data.columns

Index(['1', '1.52101', '13.64', '4.49', '1.10', '71.78', '0.06', '8.75',
       '0.00', '0.00.1', '1.1'],
      dtype='object')

In [13]:
# Check for missing values
print(data.isnull().sum())

1          0
1.52101    0
13.64      0
4.49       0
1.10       0
71.78      0
0.06       0
8.75       0
0.00       0
0.00.1     0
1.1        0
dtype: int64


In [28]:
# Split the data into features and target
X = data.drop("1.1", axis=1)
y = data["1.1"]

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Train a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_scaled, y_train)

RandomForestClassifier(random_state=42)

In [27]:
# Evaluate the model
y_pred = clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9767441860465116


In [20]:
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        10
           2       1.00      0.93      0.97        15
           3       0.75      1.00      0.86         3
           5       1.00      1.00      1.00         3
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         9

    accuracy                           0.98        43
   macro avg       0.96      0.99      0.97        43
weighted avg       0.98      0.98      0.98        43



In [26]:
# Generate predicted values for the test set
y_pred = clf.predict(X_test_scaled)


In [22]:
y_pred

array([1, 7, 1, 7, 2, 2, 1, 2, 2, 3, 6, 5, 2, 2, 6, 5, 7, 1, 1, 7, 2, 7,
       7, 7, 3, 2, 1, 1, 6, 1, 1, 2, 3, 2, 2, 7, 5, 3, 2, 2, 2, 7, 1],
      dtype=int64)

In [25]:
#Create a new DataFrame with original data and predicted values
new_table = X_test.copy()  # Create a copy of the test data
new_table["Actual Type"] = y_test.values  # Add actual types to the new table
new_table["Predicted Type"] = y_pred  # Add predicted types to the new table

In [24]:
# Display the new table
print(new_table)

       1  1.52101  13.64  4.49  1.10  71.78  0.06   8.75  0.00  0.00.1  \
9     11  1.51571  12.72  3.46  1.56  73.20  0.67   8.09  0.00    0.24   
197  199  1.51531  14.38  0.00  2.66  73.10  0.04   9.08  0.64    0.00   
66    68  1.52152  13.05  3.65  0.87  72.32  0.19   9.85  0.00    0.17   
191  193  1.51623  14.20  0.00  2.79  73.46  0.04   9.04  0.40    0.09   
117  119  1.51673  13.30  3.64  1.53  72.53  0.65   8.03  0.00    0.29   
111  113  1.52777  12.64  0.00  0.67  72.02  0.06  14.40  0.00    0.00   
15    17  1.51784  12.68  3.67  1.16  73.11  0.61   8.70  0.00    0.00   
86    88  1.51645  13.40  3.49  1.52  72.65  0.67   8.08  0.00    0.10   
75    77  1.51645  13.44  3.61  1.54  72.39  0.66   8.03  0.00    0.00   
144  146  1.51839  12.85  3.67  1.24  72.57  0.62   8.68  0.00    0.35   
182  184  1.51969  14.56  0.00  0.56  73.48  0.00  11.22  0.00    0.00   
170  172  1.51316  13.02  0.00  3.04  70.48  6.21   6.96  0.00    0.00   
141  143  1.51662  12.85  3.51  1.44  