In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

def parse_info_txt(file_path, start_line=1):
    """Parse MIAS Info.txt file and return a clean DataFrame."""
    data = []

    with open(file_path, 'r') as f:
        lines = f.readlines()[start_line - 1:]

        for line in lines:
            parts = line.strip().split()

            # Skip invalid lines
            if not parts or not parts[0].startswith("mdb"):
                continue

            try:
                # Parse common fields
                entry = {
                    "ID": parts[0],
                    "Breast Density": parts[1],
                    "Abnormality Type": parts[2] if len(parts) > 2 else "",
                    "Severity": parts[3] if len(parts) > 3 else "",
                    "X": float(parts[4]) if len(parts) > 4 else None,
                    "Y": float(parts[5]) if len(parts) > 5 else None,
                    "Radius": float(parts[6]) if len(parts) > 6 else None,
                }
                data.append(entry)
            except Exception as e:
                print(f"Skipping malformed line: {line.strip()} -> {e}")

    return pd.DataFrame(data)

In [4]:
info_file = "/content/drive/MyDrive/miniMIAS_Dataset/Info.txt"  # Change path if needed

df_info = parse_info_txt(info_file)
df_info.head()

Skipping malformed line: mdb216 D CALC M *NOTE 3* -> could not convert string to float: '*NOTE'
Skipping malformed line: mdb233 G CALC M *NOTE 3* -> could not convert string to float: '*NOTE'
Skipping malformed line: mdb245 F CALC M *NOTE 3* -> could not convert string to float: '*NOTE'


Unnamed: 0,ID,Breast Density,Abnormality Type,Severity,X,Y,Radius
0,mdb001,G,CIRC,B,535.0,425.0,197.0
1,mdb002,G,CIRC,B,522.0,280.0,69.0
2,mdb003,D,NORM,,,,
3,mdb004,D,NORM,,,,
4,mdb005,F,CIRC,B,477.0,133.0,30.0


In [None]:
df_info.to_csv("mias_parsed_info.csv", index=False)

In [10]:
# Filter rows where Abnormality Type is 'CIRC'
df_circ = df_info[df_info["Abnormality Type"] == "CIRC"]

# Print all matching rows
print(df_circ.to_string(index=False))

    ID Breast Density Abnormality Type Severity     X     Y  Radius
mdb001              G             CIRC        B 535.0 425.0   197.0
mdb002              G             CIRC        B 522.0 280.0    69.0
mdb005              F             CIRC        B 477.0 133.0    30.0
mdb005              F             CIRC        B 500.0 168.0    26.0
mdb010              F             CIRC        B 525.0 425.0    33.0
mdb012              F             CIRC        B 471.0 458.0    40.0
mdb015              G             CIRC        B 595.0 864.0    68.0
mdb017              G             CIRC        B 547.0 573.0    48.0
mdb019              G             CIRC        B 653.0 477.0    49.0
mdb021              G             CIRC        B 493.0 125.0    49.0
mdb023              G             CIRC        M 538.0 681.0    29.0
mdb025              F             CIRC        B 674.0 443.0    79.0
mdb028              F             CIRC        M 338.0 314.0    56.0
mdb059              F             CIRC        B 

In [11]:
df_info_filtered = df_info[~(
    df_info["Severity"].isin(["B", "M"]) &
    (df_info["X"].isna() | df_info["Y"].isna() | df_info["Radius"].isna())
)]

In [12]:
# Display all rows (avoid truncation)
pd.set_option('display.max_rows', None)

# Show the DataFrame
print(df_info_filtered)

         ID Breast Density Abnormality Type Severity      X      Y  Radius
0    mdb001              G             CIRC        B  535.0  425.0   197.0
1    mdb002              G             CIRC        B  522.0  280.0    69.0
2    mdb003              D             NORM             NaN    NaN     NaN
3    mdb004              D             NORM             NaN    NaN     NaN
4    mdb005              F             CIRC        B  477.0  133.0    30.0
5    mdb005              F             CIRC        B  500.0  168.0    26.0
6    mdb006              F             NORM             NaN    NaN     NaN
7    mdb007              G             NORM             NaN    NaN     NaN
8    mdb008              G             NORM             NaN    NaN     NaN
9    mdb009              F             NORM             NaN    NaN     NaN
10   mdb010              F             CIRC        B  525.0  425.0    33.0
11   mdb011              F             NORM             NaN    NaN     NaN
12   mdb012              

In [13]:
df_info_filtered.to_csv("filtered_info.csv", index=False)

In [14]:
df_info_filtered.to_csv("/content/drive/MyDrive/MIAS_third_try/filtered_info.csv", index=False)