In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Query the public DBpedia SPARQL endpoint; both queries below reuse this client.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Query 1: every rdfs:label attached to the Asturias resource.
sparql.setQuery("""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?label
    WHERE { <http://dbpedia.org/resource/Asturias> rdfs:label ?label }
""")
sparql.setReturnFormat(JSON)
results = sparql.query()
results.print_results()

# Blank separator line between the two result listings.  A bare `print` is a
# no-op expression on Python 3; print('') emits the newline on Python 2 and 3.
print('')

# Query 2: the subdivisions listed for Asturias together with their labels.
sparql.setQuery("""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dbpo: <http://dbpedia.org/property/>
SELECT ?subdivision ?label 
WHERE { 
  <http://dbpedia.org/resource/Asturias> dbpo:subdivisionName ?subdivision .
  ?subdivision rdfs:label ?label .
}
""")
sparql.setReturnFormat(JSON)
results = sparql.query()
results.print_results()

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import re
import csv

# Checking Routines

In [2]:
import os

In [4]:
def check_frame(df_in, columns, column_types):
    """Validate and convert the columns of a data frame according to type tags.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame.  It is copied first, so the caller's frame is not modified.
    columns : iterable
        Column labels to process.
    column_types : iterable of str
        One tag per entry of ``columns`` (paired with ``zip``):
        ``'year'`` -> column replaced by the result of ``check_year``;
        ``'cat'``  -> category counts are printed, column left unchanged;
        ``'num'``  -> column replaced by the result of ``check_numeric``;
        ``'none'`` -> column is dropped.
        Any other tag leaves the column untouched.

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    df = df_in.copy()
    for i, j in zip(columns, column_types):
        print('Currently Processing:', i, j)
        arr = df[i].tolist()
        if j == 'year':
            year_arr, flag = check_year(arr)
            df[i] = year_arr
            if not flag:
                print('Some years are missing.')
        elif j == 'cat':
            check_categorical(arr)
        elif j == 'num':
            num_arr, flag = check_numeric(arr)
            df[i] = num_arr
            if not flag:
                print('Some values are missing.')
        elif j == 'none':
            print('Column is dropped.')
            # `axis` must be passed by keyword: the positional form
            # df.drop(i, 1) is rejected by pandas >= 2.0.
            df = df.drop(i, axis=1)
    return df

In [5]:
def check_numeric(arr):
    """Convert ``arr`` to numbers, preferring integers when every value is whole.

    The values are first parsed with ``check_float``.  If that succeeds for
    every entry, ``check_int`` is tried; when all values are whole numbers the
    integer list is returned, otherwise the float list is returned.

    Returns
    -------
    (list, bool)
        The converted values and a flag that is ``True`` only when every entry
        could be parsed.  Previously the flag stayed ``True`` even when
        ``check_float`` reported failures, so callers never learned about
        missing values.
    """
    float_arr, float_flag = check_float(arr)
    if not float_flag:
        # At least one entry failed to parse: hand back the float list (with
        # its NaN placeholders) and report the failure to the caller.
        return float_arr, False

    int_arr, int_flag = check_int(float_arr)
    if int_flag:
        print("Array is Integer.")
        return int_arr, True

    print("Array is Float.")
    return float_arr, True
    
def check_int(arr):
    """Convert whole-valued numbers in ``arr`` to ``int``.

    Parameters
    ----------
    arr : iterable of numbers
        Typically the float list produced by ``check_float``.

    Returns
    -------
    (list, bool)
        A list with ``int`` for every whole value and NaN for every other
        value, plus a flag that is ``True`` only if all values were whole.
    """
    flag = True
    int_arr = []
    for flo in arr:
        # float(...) lets plain ints through as well; NaN and infinities are
        # reported as non-integers by is_integer().
        if float(flo).is_integer():
            int_arr.append(int(flo))
        else:
            # np.nan instead of np.NaN: the capitalised alias was removed in
            # NumPy 2.0.
            int_arr.append(np.nan)
            flag = False

    return int_arr, flag

def check_float(arr):
    """Parse every entry of ``arr`` as a float, tolerating bad or missing cells.

    String entries have their comma thousands separators removed before
    parsing (``'76,314'`` -> ``76314.0``).  Non-string entries, such as the
    float NaN that pandas uses for an empty CSV cell, are passed to
    ``float`` directly instead of raising ``AttributeError`` on ``.replace``.

    Returns
    -------
    (list, bool)
        The parsed floats, with NaN wherever a value is missing or cannot be
        parsed, and a flag that is ``True`` only if no entry ended up as NaN.
    """
    flag = True
    float_arr = []
    for s in arr:
        try:
            text = s.replace(',', '')
        except (AttributeError, TypeError):
            # Not a text value (e.g. NaN/None from a missing cell or an
            # already-numeric value): let float() deal with it as-is.
            text = s
        try:
            float_s = float(text)
        except (TypeError, ValueError):
            # np.nan instead of np.NaN: the alias was removed in NumPy 2.0.
            float_s = np.nan
        if float_s != float_s:
            # NaN is the only value that is not equal to itself.
            flag = False
        float_arr.append(float_s)

    return float_arr, flag

def check_categorical(arr):
    """Print how often each distinct value occurs in ``arr``.

    Output is the line ``Categories:``, a list of ``(value, count)`` pairs and
    a trailing blank line.  Nothing is returned.
    """
    print("Categories:")
    cnt = Counter(arr)
    # list(...) keeps the printed form a plain list on Python 3, where
    # items() would otherwise show up as ``dict_items([...])``.
    print(list(cnt.items()))
    # A bare `print` does nothing on Python 3; print('') emits the blank
    # separator line on both Python 2 and 3.
    print('')

def check_year(arr, min_year=1975, max_year=2016):
    """Convert ``arr`` to numbers and blank out values outside the year range.

    Parameters
    ----------
    arr : iterable
        Raw values, passed to ``check_numeric`` for conversion.
    min_year, max_year : number, optional
        Inclusive bounds of the accepted range.  The defaults (1975 and 2016)
        are the limits that used to be hard-coded.

    Returns
    -------
    (list, bool)
        The converted values, with NaN in place of every value that is outside
        the range (NaN inputs fail the comparison and stay NaN), and a flag
        that is ``False`` if ``check_numeric`` reported a problem or any value
        was replaced.
    """
    num_arr, flag = check_numeric(arr)
    yr_arr = []
    for yr in num_arr:
        if min_year <= yr <= max_year:
            yr_arr.append(yr)
        else:
            # np.nan instead of np.NaN: the alias was removed in NumPy 2.0.
            yr_arr.append(np.nan)
            flag = False
    return yr_arr, flag

In [None]:
# Interactively type-check every file in the raw storage directory and write
# the processed copy, under the same file name, to the checked directory.
owd = os.getcwd()
filepath_in = "../raw_csv_storage/"
# Resolved relative to the raw storage directory, which is the working
# directory while the loop below runs.
filepath_out = "../checked_csv_storage/"

# `raw_input` exists only on Python 2; fall back to `input` on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

os.chdir(filepath_in)
try:
    files = os.listdir(os.getcwd())
    for filename in files:
        df = pd.read_csv(filename, dtype='str')
        cols = df.columns.tolist()
        print('Processing file:', filename)
        print(cols)
        print('Please specify the types of these columns. Options are: year, '
              'cat (categorical), num (numeric), none (column is not needed).')
        # Ask the user for one type tag per column, in column order.
        col_types = []
        for col in cols:
            print(col)
            s = read_line()
            col_types.append(s)
        df_proc = check_frame(df, cols, col_types)
        df_proc.to_csv(filepath_out + filename)
finally:
    # Restore the original working directory even if a file fails to load or
    # process; otherwise later cells would run from the wrong directory.
    os.chdir(owd)

In [43]:
# Read crime.csv with every column as a string, then list its columns and the
# type tag to use for each one (paired by position when passed to check_frame).
filepath = "../raw_csv_storage/"
filename = "crime.csv"
crime_path = filepath + filename
crime_df = pd.read_csv(crime_path, dtype='str')
columns = list(crime_df.columns)
column_types = [
    'year',
    'cat',
    'cat',
    'cat',
    'num',
    'none',
]

In [47]:
# Run the type checks on the crime table; `df` is the processed copy and
# `crime_df` keeps the original string values.
df = check_frame(crime_df,columns,column_types)

('Currently Processing:', 'TIME', 'year')
Array is Integer.
('Currently Processing:', 'GEO', 'cat')
Categories:
[('Turkey', 84), ('Italy', 84), ('Czech Republic', 84), ('Lithuania', 84), ('Luxembourg', 84), ('France', 84), ('Slovakia', 84), ('Ireland', 84), ('England and Wales', 84), ('Norway', 84), ('Scotland', 84), ('Iceland', 84), ('Montenegro', 84), ('Slovenia', 84), ('Bosnia and Herzegovina', 84), ('Belgium', 84), ('Kosovo (under United Nations Security Council Resolution 1244/99)', 84), ('Spain', 84), ('Germany (until 1990 former territory of the FRG)', 84), ('Netherlands', 84), ('Denmark', 84), ('Poland', 84), ('Finland', 84), ('Northern Ireland (UK)', 84), ('Sweden', 84), ('Latvia', 84), ('Croatia', 84), ('Malta', 84), ('Switzerland', 84), ('Bulgaria', 84), ('Romania', 84), ('Albania', 84), ('Portugal', 84), ('Estonia', 84), ('Former Yugoslav Republic of Macedonia, the', 84), ('Serbia', 84), ('Liechtenstein', 84), ('Austria', 84), ('Greece', 84), ('Hungary', 84), ('Cyprus', 84)

In [48]:
# Show each processed value next to the raw string it was parsed from.
value_pairs = zip(df['Value'], crime_df['Value'])
for processed, raw in value_pairs:
    print(processed, raw)

(203.0, '203')
(1.8999999999999999, '1.90')
(76314.0, '76,314')
(715.42999999999995, '715.43')
(10779.0, '10,779')
(101.05, '101.05')
(3231.0, '3,231')
(30.289999999999999, '30.29')
(7548.0, '7,548')
(70.760000000000005, '70.76')
(245682.0, '245,682')
(2303.23, '2,303.23')
(172.0, '172')
(2.29, '2.29')
(3216.0, '3,216')
(42.780000000000001, '42.78')
(749.0, '749')
(9.9600000000000009, '9.96')
(262.0, '262')
(3.48, '3.48')
(487.0, '487')
(6.4800000000000004, '6.48')
(42553.0, '42,553')
(566.00999999999999, '566.01')
(113.0, '113')
(1.0900000000000001, '1.09')
(17875.0, '17,875')
(172.81999999999999, '172.82')
(1680.0, '1,680')
(16.239999999999998, '16.24')
(529.0, '529')
(5.1100000000000003, '5.11')
(1151.0, '1,151')
(11.130000000000001, '11.13')
(166085.0, '166,085')
(1605.71, '1,605.71')
(54.0, '54')
(0.98999999999999999, '0.99')
(11203.0, '11,203')
(204.59, '204.59')
(1837.0, '1,837')
(33.549999999999997, '33.55')
(396.0, '396')
(7.2300000000000004, '7.23')
(1441.0, '1,441')
(26.32, 

In [9]:
# Write the processed frame to crime_processed.csv in the current working
# directory (the index is written as well, since to_csv's defaults are used).
df.to_csv('crime_processed.csv')

In [33]:
def check_numeric(arr):
    """Convert ``arr`` to numbers, preferring integers when every value is whole.

    The values are first parsed with ``check_float``.  If that succeeds for
    every entry, ``check_int`` is tried; when all values are whole numbers the
    integer list is returned, otherwise the float list is returned.

    Returns
    -------
    (list, bool)
        The converted values and a flag that is ``True`` only when every entry
        could be parsed.  Previously the flag stayed ``True`` even when
        ``check_float`` reported failures, so callers never learned about
        missing values.
    """
    float_arr, float_flag = check_float(arr)
    if not float_flag:
        # At least one entry failed to parse: hand back the float list (with
        # its NaN placeholders) and report the failure to the caller.
        return float_arr, False

    int_arr, int_flag = check_int(float_arr)
    if int_flag:
        print("Array is Integer.")
        return int_arr, True

    print("Array is Float.")
    return float_arr, True
    
def check_int(arr):
    """Convert whole-valued numbers in ``arr`` to ``int``.

    Parameters
    ----------
    arr : iterable of numbers
        Typically the float list produced by ``check_float``.

    Returns
    -------
    (list, bool)
        A list with ``int`` for every whole value and NaN for every other
        value, plus a flag that is ``True`` only if all values were whole.
    """
    flag = True
    int_arr = []
    for flo in arr:
        # float(...) lets plain ints through as well; NaN and infinities are
        # reported as non-integers by is_integer().
        if float(flo).is_integer():
            int_arr.append(int(flo))
        else:
            # np.nan instead of np.NaN: the capitalised alias was removed in
            # NumPy 2.0.
            int_arr.append(np.nan)
            flag = False

    return int_arr, flag

def check_float(arr):
    """Parse every entry of ``arr`` as a float, tolerating bad or missing cells.

    String entries have their comma thousands separators removed before
    parsing (``'76,314'`` -> ``76314.0``).  Non-string entries, such as the
    float NaN that pandas uses for an empty CSV cell, are passed to
    ``float`` directly instead of raising ``AttributeError`` on ``.replace``.

    Returns
    -------
    (list, bool)
        The parsed floats, with NaN wherever a value is missing or cannot be
        parsed, and a flag that is ``True`` only if no entry ended up as NaN.
    """
    flag = True
    float_arr = []
    for s in arr:
        try:
            text = s.replace(',', '')
        except (AttributeError, TypeError):
            # Not a text value (e.g. NaN/None from a missing cell or an
            # already-numeric value): let float() deal with it as-is.
            text = s
        try:
            float_s = float(text)
        except (TypeError, ValueError):
            # np.nan instead of np.NaN: the alias was removed in NumPy 2.0.
            float_s = np.nan
        if float_s != float_s:
            # NaN is the only value that is not equal to itself.
            flag = False
        float_arr.append(float_s)

    return float_arr, flag

def check_categorical(arr):
    """Print how often each distinct value occurs in ``arr``.

    Output is the line ``Categories:``, a list of ``(value, count)`` pairs and
    a trailing blank line.  Nothing is returned.
    """
    print("Categories:")
    cnt = Counter(arr)
    # list(...) keeps the printed form a plain list on Python 3, where
    # items() would otherwise show up as ``dict_items([...])``.
    print(list(cnt.items()))
    # A bare `print` does nothing on Python 3; print('') emits the blank
    # separator line on both Python 2 and 3.
    print('')

def check_year(arr, min_year=1975, max_year=2016):
    """Convert ``arr`` to numbers and blank out values outside the year range.

    Parameters
    ----------
    arr : iterable
        Raw values, passed to ``check_numeric`` for conversion.
    min_year, max_year : number, optional
        Inclusive bounds of the accepted range.  The defaults (1975 and 2016)
        are the limits that used to be hard-coded.

    Returns
    -------
    (list, bool)
        The converted values, with NaN in place of every value that is outside
        the range (NaN inputs fail the comparison and stay NaN), and a flag
        that is ``False`` if ``check_numeric`` reported a problem or any value
        was replaced.
    """
    num_arr, flag = check_numeric(arr)
    yr_arr = []
    for yr in num_arr:
        if min_year <= yr <= max_year:
            yr_arr.append(yr)
        else:
            # np.nan instead of np.NaN: the alias was removed in NumPy 2.0.
            yr_arr.append(np.nan)
            flag = False
    return yr_arr, flag

In [54]:
# Scratch check: read one line typed by the user into `py`.
# NOTE(review): raw_input is the Python 2 built-in; Python 3 names it input().
py = raw_input()

Hello


In [55]:
# Show the type of the value that raw_input() returned.
type(py)

str