In [5]:
import pandas as pd

In [6]:
# Load the raw labels CSV (path is relative to the notebook's working directory).
df = pd.read_csv("azfc_labels.csv")

In [7]:
# Show column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5820 entries, 0 to 5819
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        5820 non-null   int64 
 1   measurement_date  5820 non-null   object
 2   machine_name      5820 non-null   object
 3   bearing_fault     5820 non-null   int64 
 4   status            5820 non-null   object
 5   speed             5820 non-null   object
 6   zone_number       5820 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 318.4+ KB


In [8]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0.1,Unnamed: 0,measurement_date,machine_name,bearing_fault,status,speed,zone_number
0,0,1/4/2016,P-010-02 A Turbo-generator station pump,0,L1,1489.2,3
1,1,1/4/2016,P-010-02 B Turbo-generator station pump,1,L3,1492.8,3
2,2,1/18/2016,1403 Main air blower,0,L2,2998.8,3
3,3,1/18/2016,1901 Turb 6 Acc,0,L1,7590.0,3
4,4,3/6/2016,Air RecFan 1.12402A,1,L2,1487.0,1


In [9]:
# List the column labels to spot columns that are not needed.
df.columns

Index(['Unnamed: 0', 'measurement_date', 'machine_name', 'bearing_fault',
       'status', 'speed', 'zone_number'],
      dtype='object')

* Dropping unnecessary index column

In [10]:
# Discard the index column that was written into the CSV as "Unnamed: 0".
df = df.drop(columns=["Unnamed: 0"])

* Calculating the mean speed of every machine from its valid readings

In [11]:
def _parse_speed(raw):
    """Return ``raw`` as a float, or ``None`` when it holds no usable number.

    Thousands separators (",") and the stray "â€¬" marker are stripped before
    parsing, wherever they occur in the string, so a marker at the very start
    of a value no longer causes an otherwise valid reading to be ignored.
    """
    cleaned = raw.replace(",", "").replace("â€¬", "")
    try:
        return float(cleaned)
    except ValueError:
        return None


# Mean speed per machine, computed from parseable readings only.
# Machines without a single parseable reading keep the default of 0 and are
# printed so they can be reviewed.
means = {machine: 0 for machine in df["machine_name"].unique()}
print("All these machines don't have a single valid speed value:\n")
for machine, group in df.groupby("machine_name"):
    # Named `total` rather than `sum` so the builtin is not shadowed for the
    # rest of the kernel session.
    total = 0.0
    count = 0
    for raw_speed in group["speed"]:
        value = _parse_speed(raw_speed)
        if value is None:
            continue
        total += value
        count += 1
    if count > 0:
        means[machine] = total / count
    else:
        print(machine)

All these machines don't have a single valid speed value:

Maturation Tank Agitator 15MX802-1 Gearbox,Maturation Tank Agitator 15MX802-1 Motor
Maturation Tank Agitator 15MX802-2 Gearbox,Maturation Tank Agitator 15MX802-2 Motor
Oleum Tank Pump 1425
P-012-02A Acid Storage Pump-A
P-012-02B Acid Storage Pump-B
Reactor Agitator 15MX801 Motor,Reactor Agitator 15MX801 Gearbox


* Replacing the placeholder speed values (ERROR, â€¬, of) with the mean speed of their machine

In [12]:
# One value is the malformed string "1489.2.2"; map it to "1489.2".
df.loc[df["speed"] == "1489.2.2", "speed"] = "1489.2"

# Values that stand in for a reading and contain no number at all.
placeholders = ("ERROR", "of", "â€¬")

for idx, raw in df["speed"].items():
    try:
        float(raw)
        continue  # already parseable, nothing to repair
    except ValueError:
        pass

    if raw in placeholders:
        # No usable reading: fall back to this machine's mean speed.
        # `means` holds 0 for machines that had no valid reading at all.
        df.loc[idx, "speed"] = means[df.loc[idx, "machine_name"]]
    elif "," in raw or "â€¬" in raw:
        # Explicit membership tests: str.find() returns -1 (truthy) when the
        # text is absent and 0 (falsy) when it is at the start, so it must not
        # be used as a boolean.
        df.loc[idx, "speed"] = raw.replace(",", "").replace("â€¬", "")

* Fixing the data types of the columns so they can be used and processed more easily

In [13]:
# Convert the cleaned speed column to a floating-point dtype.
df = df.astype({"speed": float})

In [14]:
# Parse the month/day/year strings into datetime values.
df = df.assign(
    measurement_date=pd.to_datetime(df["measurement_date"], format="%m/%d/%Y")
)

In [15]:
# Re-check the dtypes and non-null counts after the conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5820 entries, 0 to 5819
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   measurement_date  5820 non-null   datetime64[ns]
 1   machine_name      5820 non-null   object        
 2   bearing_fault     5820 non-null   int64         
 3   status            5820 non-null   object        
 4   speed             5820 non-null   float64       
 5   zone_number       5820 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 272.9+ KB


In [16]:
# Preview the first rows of the cleaned data.
df.head()

Unnamed: 0,measurement_date,machine_name,bearing_fault,status,speed,zone_number
0,2016-01-04,P-010-02 A Turbo-generator station pump,0,L1,1489.2,3
1,2016-01-04,P-010-02 B Turbo-generator station pump,1,L3,1492.8,3
2,2016-01-18,1403 Main air blower,0,L2,2998.8,3
3,2016-01-18,1901 Turb 6 Acc,0,L1,7590.0,3
4,2016-03-06,Air RecFan 1.12402A,1,L2,1487.0,1


* Writing the edited data to a new csv file

In [17]:
# Save the cleaned frame; index=False keeps the row index out of the file.
df.to_csv("fixed_data.csv", index=False)