In [1]:
# Mount Google Drive at /content/drive; every path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np; np.random.seed(0)
import seaborn as sns; sns.set_theme()
import os.path


# Project folder on the mounted Drive (note the trailing slash) and the
# master parts table stored directly inside it.
dir0 = '/content/drive/MyDrive/ML1/'
fname_master = f'{dir0}master.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!cd /content/drive/MyDrive/ML1/ && rm train*.csv

rm: cannot remove 'train*.csv': No such file or directory


In [3]:
import os

def url_to_fname(url):
  """Return the part of `url` that follows its last '/'.

  When `url` contains no '/', the whole string is returned unchanged and
  is also printed, which flags inputs that do not look like a URL path.
  """
  _, slash, tail = url.rpartition('/')
  if not slash:
    # No separator at all: the input itself is used as the file name.
    print(url)
    return url
  return tail

def file_exists_and_valid_size(url, dir1 = '/content/drive/MyDrive/ML1/datasheets/'):
  """Tell whether the datasheet for `url` is stored locally and is non-empty.

  Args:
    url: Datasheet URL; only the part after its last '/' is used as the
      local file name.
    dir1: Directory that holds the stored datasheets. This argument used
      to be ignored in favour of a hard-coded path; it is now honoured.

  Returns:
    True when the file exists in `dir1` and its size is greater than zero,
    otherwise False.
  """
  fname = os.path.join(dir1, url_to_fname(url))
  # isfile() is checked first so getsize() is never called on a missing path.
  return os.path.isfile(fname) and (os.path.getsize(fname) > 0)

def url_to_local_fname(url, dir1 = '/content/drive/MyDrive/ML1/datasheets/'):
  """Map a datasheet URL to the path of its local copy.

  Args:
    url: Datasheet URL; only the part after its last '/' is used.
    dir1: Directory that holds the stored datasheets. This argument used
      to be ignored in favour of a hard-coded path; it is now honoured.

  Returns:
    `dir1` joined with the file name taken from `url`.
  """
  return os.path.join(dir1, url_to_fname(url))


def replace_invalid_rating(rating):
  """Correct the 'IP69X' typo in an IP-rating label to 'IP69K'.

  Args:
    rating: The label value. Strings are corrected; anything else (for
      example the float NaN used for a missing label, or None) is passed
      through untouched instead of raising AttributeError.

  Returns:
    The corrected string, or the original object when it is not a string.
  """
  if isinstance(rating, str):
    return rating.replace('IP69X', 'IP69K')
  return rating


def process_data(df):
  """Reduce the master parts table to rows that can be used for training.

  The steps run in this order, printing the frame shape after each one:
    1. drop the 'Unnamed: 0' column (if present) and exact duplicate rows;
    2. drop rows without an 'IP Rating' label;
    3. drop rows without a 'Datasheet' URL;
    4. add 'local_datasheet' and 'file_exists', keeping only rows whose
       datasheet file is reported as existing and non-empty;
    5. apply replace_invalid_rating to the 'IP Rating' column;
    6. collapse rows that share both datasheet and label (last one wins);
    7. drop every row whose datasheet is still shared by several rows.

  Returns the filtered frame with all remaining columns.
  """
  print(df.shape)

  # The stored index column makes otherwise identical rows look different,
  # so it has to go before duplicates can be detected.
  index_col = 'Unnamed: 0'
  if index_col in df.columns:
    df = df.drop(columns=[index_col])
  df = df.drop_duplicates()
  print(f'after dropping duplicates, shape={df.shape}')

  # Rows without a label are useless as training targets.
  has_label = df['IP Rating'].notna()
  df = df[has_label]
  print(f'after dropping null labels, shape={df.shape}')

  # Rows without a datasheet URL have no input; ideally the datasheet would
  # be looked up on the web through the part name instead.
  has_datasheet = df['Datasheet'].notna()
  df = df[has_datasheet]
  print(f'after dropping null data sheets, shape={df.shape}')

  # Derive the local path of each datasheet from its URL.
  df = df.assign(local_datasheet=df['Datasheet'].apply(url_to_local_fname))

  # Keep only the rows whose datasheet file exists and is not empty.
  df = df.assign(file_exists=df['Datasheet'].apply(file_exists_and_valid_size))
  df = df[df['file_exists'] == True]
  print(f'after dropping rows whose datsheets do not exist and size is 0, shape={df.shape}')

  # Fix the invalid IP rating (a typo), as confirmed by the datasheets.
  df = df.assign(**{'IP Rating': df['IP Rating'].apply(replace_invalid_rating)})

  df = df.drop_duplicates(subset=['local_datasheet', 'IP Rating'], keep='last')
  print(f'after dropping rows that share datasheet as well as label, shape={df.shape}')

  # Any datasheet that still occurs more than once now carries conflicting
  # labels, so all of its rows are removed.
  sheet_counts = df['local_datasheet'].value_counts()
  repeated_datasheet = list(sheet_counts[sheet_counts > 1].index)
  df = df[~df['local_datasheet'].isin(repeated_datasheet)]
  print(f'after dropping rows that share datasheet but not label, shape={df.shape}')

  return df


# Load the master parts table and run the cleaning pipeline on it; the cell
# displays the column names of the cleaned frame.
df = pd.read_csv(fname_master, index_col=None)
df.head(3)  # not the last expression of the cell, so this preview is never displayed
df = process_data(df)
df.columns

(6125, 14)
after dropping duplicates, shape=(4350, 13)
after dropping null labels, shape=(3673, 13)
after dropping null data sheets, shape=(3594, 13)
after dropping rows whose datsheets do not exist and size is 0, shape=(3260, 15)
after dropping rows that share datasheet as well as label, shape=(2489, 15)
after dropping rows that share datasheet but not label, shape=(2472, 15)


Index(['Mouser Part Number', 'Mfr Part Number', 'Mfr.', 'Datasheet',
       'Availability', 'Pricing', 'RoHS', 'Lifecycle', 'Product Detail',
       'IP Rating', 'Product', 'Contact Gender', 'Termination Style',
       'local_datasheet', 'file_exists'],
      dtype='object')

In [4]:
from sklearn.model_selection import ShuffleSplit

# Draw 10 random train/test partitions (10% test) of the cleaned table.
# NOTE(review): per the scikit-learn docs, ShuffleSplit.split ignores `y`, so
# these splits are not stratified by "IP Rating" despite the `strat_` names —
# confirm whether StratifiedShuffleSplit was intended.
splitter = ShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
strat_splits = []
for train_index, test_index in splitter.split(df, df["IP Rating"]):
    strat_train_set_n = df.iloc[train_index]
    strat_test_set_n = df.iloc[test_index]
    strat_splits.append([strat_train_set_n, strat_test_set_n])

# Only the training part of the LAST split is kept and written to Drive; the
# test parts stay in strat_splits and are not saved by this cell.
df = strat_train_set_n
df.to_csv(dir0 + 'train.csv', index=False)

In [5]:
# Reload the training split from train.csv on Drive and display it.
df = pd.read_csv(dir0 + 'train.csv')
df

Unnamed: 0,Mouser Part Number,Mfr Part Number,Mfr.,Datasheet,Availability,Pricing,RoHS,Lifecycle,Product Detail,IP Rating,Product,Contact Gender,Termination Style,local_datasheet,file_exists
0,651-1523502,"=""1523502""",Phoenix Contact,https://www.mouser.com/datasheet/2/324/4/15235...,4 In Stock,"=""$42.76""",RoHS Compliant By Exemption,,https://www.mouser.com/ProductDetail/Phoenix-C...,IP67,Connectors,Pin (Male),Wire,/content/drive/MyDrive/ML1/datasheets/1523502-...,True
1,654-RT06102PNH,RT06102PNH,Amphenol SINE Systems,https://www.amphenol-sine.com/pdf/datasheet/RT...,29 In Stock,"=""$19.48""",RoHS Compliant,,https://www.mouser.com/ProductDetail/Amphenol-...,IP67,Plugs,Pin (Male),-,/content/drive/MyDrive/ML1/datasheets/RT06102P...,True
2,736-EXG1B305HLN,EXG.1B.305.HLN,LEMO,https://www.mouser.com/datasheet/2/232/EXG_1B_...,19 In Stock,"=""$65.32""",RoHS Compliant By Exemption,,https://www.mouser.com/ProductDetail/LEMO/EXG....,IP50,Connectors,Socket (Female),Print (Straight),/content/drive/MyDrive/ML1/datasheets/EXG_1B_3...,True
3,394-21M0CP10MCC065CS,S21M0C-P10MCC0-65CS,ODU,https://www.mouser.com/datasheet/2/941/ODU_041...,Non-Stocked,"=""$37.47""",RoHS Compliant,,https://www.mouser.com/ProductDetail/ODU/S21M0...,IP50,Connectors,Pin (Male),Solder,/content/drive/MyDrive/ML1/datasheets/ODU_0414...,True
4,394-C10WBMP09XMM0000,C10WBM-P09XMM0-0000,ODU,https://www.mouser.com/datasheet/2/941/C10WBM_...,10 In Stock,"=""$120.29""",RoHS Compliant,New Product,https://www.mouser.com/ProductDetail/ODU/C10WB...,"IP6K8, IP6K9K",Plugs,Pin (Male),Solder,/content/drive/MyDrive/ML1/datasheets/C10WBM_P...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2219,394-S11MC7P10MCC0397,S11MC7-P10MCC0-3970,ODU,https://www.mouser.com/datasheet/2/941/MEDI_SN...,Non-Stocked Lead-Time 16 Weeks,"=""$24.51""",,,https://www.mouser.com/ProductDetail/ODU/S11MC...,IP50,Connectors,Pin (Male),Solder,/content/drive/MyDrive/ML1/datasheets/MEDI_SNA...,True
2220,651-1559932,"=""1559932""",Phoenix Contact,https://www.mouser.com/datasheet/2/324/4/15599...,5 In Stock,"=""$48.61""",RoHS Compliant By Exemption,,https://www.mouser.com/ProductDetail/Phoenix-C...,"IP65, IP67",Connectors,Pin (Male),Solder,/content/drive/MyDrive/ML1/datasheets/1559932-...,True
2221,394-G11M07P02LPH040,G11M07-P02LPH0-0040,ODU,https://www.mouser.com/datasheet/2/941/MEDI_SN...,19 In Stock,"=""$18.32""",,,https://www.mouser.com/ProductDetail/ODU/G11M0...,IP50,Connectors,Socket (Female),Solder,/content/drive/MyDrive/ML1/datasheets/MEDI_SNA...,True
2222,566-RKC4016-1-1,RKC 40/16 single pk of 1,Lumberg Automation,https://www.mouser.com/datasheet/2/46/RKC3011-...,,"=""$29.67""",RoHS Compliant By Exemption,,https://www.mouser.com/ProductDetail/Lumberg-A...,IP67,Connectors,Socket (Female),Solder,/content/drive/MyDrive/ML1/datasheets/RKC3011-...,True


In [6]:
!sudo apt-get update
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install pdftotext


Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [1,085 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,233 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 2,659 kB in 4s (597 kB/s)
Reading package li

In [7]:



import pdftotext
import re


def find_relevant_string(text, patterns=['IP', 'Ingress', 'Protection', 'International'], length=20):
  """Collect short snippets of `text` that start at each keyword match.

  Args:
    text: The text to search.
    patterns: Regular expressions (passed to re.finditer as-is, so plain
      keywords match case-sensitively). They are processed in the given
      order; the list is only read, never modified.
    length: Number of characters taken from the start of every match.
      This argument used to be ignored and 20 characters were always
      taken; it is now honoured, and its default is 20 so that calls
      relying on the default return exactly what they did before.

  Returns:
    One string in which every snippet is preceded by a single space,
    grouped by pattern and ordered by position within each pattern.
    An empty string when nothing matches.
  """
  snippets = []
  for pattern in patterns:
    for match in re.finditer(pattern, text):
      start = match.start()
      snippets.append(' ' + text[start:start + length])
  return ''.join(snippets)

def read_through_pdftotext(fname):
  """Extract the keyword snippets from the PDF stored at `fname`.

  The file is opened in binary mode and handed to pdftotext.PDF; the
  resulting pages are joined with two spaces into one string, which is
  then reduced by find_relevant_string using its default arguments.
  """
  with open(fname, "rb") as pdf_file:
    pages = pdftotext.PDF(pdf_file)

  # All pages become one string so a single search covers the whole document.
  return find_relevant_string("  ".join(pages))

In [8]:
# Run the PDF keyword extraction on every local datasheet and store the
# result in a new 'text' column.
df['text'] = df['local_datasheet'].apply(read_through_pdftotext)

In [9]:
# Keep rows whose 'text' is not null and cache them on Drive.
# NOTE(review): read_through_pdftotext returns a (possibly empty) string, so
# notna() presumably keeps every row here — confirm whether rows with an
# empty string were meant to be dropped instead.
df1 = df[df['text'].notna()]
df1.to_csv(dir0 + 'text_from_pdf.csv', index=False)

In [10]:
# Reload the cached extraction results from Drive.
# NOTE(review): empty 'text' values presumably come back as NaN after the CSV
# round trip — verify; the following cell fills missing values with 'E'.
df = pd.read_csv(dir0 + 'text_from_pdf.csv')



In [11]:
df['text'].value_counts()  # result is discarded: only the last expression of a cell is displayed
# Replace missing text with the one-character placeholder 'E'.
df['text'] = df['text'].fillna('E')
# Length of the text per row, followed by the distribution of those lengths.
df['text_len'] = df['text'].apply(lambda s: len(s))
df['text_len'].value_counts()

1       831
42      389
21      321
84      271
63      155
147      78
105      67
168      26
189      25
126      25
210       4
336       2
1575      2
1449      2
2331      2
3948      2
294       2
1405      1
3885      1
252       1
945       1
315       1
651       1
2310      1
273       1
3549      1
2730      1
504       1
2667      1
2142      1
1029      1
231       1
1491      1
3717      1
987       1
840       1
735       1
Name: text_len, dtype: int64