#  Data Science From Scratch - Joel Grus(O'REILLY)

In [1]:
# One record per user; ids are assigned in list order, starting at 0.
users = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate(
        [
            "Hero", "Dunn", "Sue", "Chi", "Thor",
            "Clive", "Hicks", "Devin", "Kate", "Klein",
        ]
    )
]

In [2]:
# Friendship edges as (user_id, user_id) pairs; each pair is listed once.
friendship_pairs = [
    (0, 1), (0, 2), (1, 2), (1, 3),
    (2, 3), (3, 4), (4, 5), (5, 6),
    (5, 7), (6, 8), (7, 8), (8, 9),
]

In [3]:
# Start every user off with an empty friend list, keyed by user id.
friendships = {}
for user in users:
    friendships[user["id"]] = []
print(friendships)


{0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}


In [4]:
# Record each pair in both directions: first j as a friend of i, then i as a
# friend of j.
for i, j in friendship_pairs:
    for person, friend in ((i, j), (j, i)):
        friendships[person].append(friend)

In [5]:
# Finding total number of connections

def number_of_friends(user):
    """Return how many friends `user` has (the length of their friendships entry)."""
    return len(friendships[user["id"]])


# Sum of every user's friend count.
total_connections = sum(map(number_of_friends, users))

In [6]:
# Average number of connections

num_users = len(users)  # how many users there are
avg_connections = total_connections / num_users  # mean friends per user: 24 / 10 = 2.4


In [7]:
# finding most friends and least friends

# (user_id, number_of_friends) pairs, ordered from most friends to fewest.
# sorted() is stable, so users with equal counts keep their original order.
num_friends_by_id = sorted(
    ((user["id"], number_of_friends(user)) for user in users),
    key=lambda pair: pair[1],
    reverse=True,
)

In [8]:
# Friends of friends (simple method, but not optimised)
def foaf_ids_bad(user):
    """Collect the ids of every friend of every friend of `user`.

    "foaf" is short for "friend of a friend". This is the naive version:
    nothing is filtered or de-duplicated, so the result can contain the
    user's own id, ids of direct friends, and repeated ids.
    """
    foaf_ids = []
    for friend_id in friendships[user["id"]]:
        foaf_ids.extend(friendships[friend_id])
    return foaf_ids


for user in users:
    print(foaf_ids_bad(user))
# For the first user (id 0), the result contains id 0 twice, because user 0 is a friend of both of their friends (users 1 and 2). It also contains users 1 and 2, who are already direct friends, and user 3 twice.

[0, 2, 3, 0, 1, 3]
[1, 2, 0, 1, 3, 1, 2, 4]
[1, 2, 0, 2, 3, 1, 2, 4]
[0, 2, 3, 0, 1, 3, 3, 5]
[1, 2, 4, 4, 6, 7]
[3, 5, 5, 8, 5, 8]
[4, 6, 7, 6, 7, 9]
[4, 6, 7, 6, 7, 9]
[5, 8, 5, 8, 8]
[6, 7, 9]


In [9]:
# foaf, excluding people already known to the user (we want only friends of friends, excluding the user themself and their direct friends)

from collections import Counter  # not loaded by default

def friends_of_friends(user):
    """Count the mutual friends linking `user` to each friend-of-a-friend.

    The user themself and the user's direct friends are excluded.

    Returns:
        A Counter mapping each friend-of-a-friend id to the number of the
        user's friends who are also friends with that person.
    """
    user_id = user["id"]
    friend_ids = friendships[user_id]
    # Build the exclusion set once so each membership test is a hash lookup
    # instead of a rescan of the friend list for every candidate.
    excluded = set(friend_ids)
    excluded.add(user_id)
    return Counter(
        foaf_id
        for friend_id in friend_ids             # for each of my friends
        for foaf_id in friendships[friend_id]   # look at their friends
        if foaf_id not in excluded              # skip myself and my own friends
    )


for user in users:
    # Prints Counter({x: y}): x is a friend-of-a-friend id and y is the number
    # of mutual friends shared with that person.
    print(friends_of_friends(user))

Counter({3: 2})
Counter({4: 1})
Counter({4: 1})
Counter({0: 2, 5: 1})
Counter({1: 1, 2: 1, 6: 1, 7: 1})
Counter({8: 2, 3: 1})
Counter({7: 2, 4: 1, 9: 1})
Counter({6: 2, 4: 1, 9: 1})
Counter({5: 2})
Counter({6: 1, 7: 1})


In [10]:
# (user_id, interest) pairs; a user appears once per interest they hold.
# Spelling of "NoSQL" and "statsmodels" corrected so exact-match lookups work.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
    (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"), (6, "statistics"),
    (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
    (9, "Java"), (9, "MapReduce"), (9, "Big Data"),
]

In [11]:
# Ids of the users whose interests include target_interest.
def data_scientists_who_like(target_interest):
    """Return the ids of all users who list exactly `target_interest`.

    Every call scans the whole `interests` list, so this is not optimised
    for repeated lookups.
    """
    matching_ids = []
    for user_id, user_interest in interests:
        if user_interest == target_interest:
            matching_ids.append(user_id)
    return matching_ids


In [12]:
# Interests, Optimized
from collections import defaultdict

# Build both lookup tables in a single pass over the (user_id, interest) pairs:
#   user_ids_by_interest: interest -> list of user_ids with that interest
#   interests_by_user_id: user_id  -> list of interests for that user_id
user_ids_by_interest = defaultdict(list)
interests_by_user_id = defaultdict(list)

for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)
    interests_by_user_id[user_id].append(interest)


def most_common_interests_with(user):
    """Count how many interests each other user shares with `user`.

    Returns a Counter mapping another user's id to the number of interests
    they have in common; the user's own id is left out.
    """
    own_id = user["id"]
    shared = Counter()
    for interest in interests_by_user_id[own_id]:
        for other_id in user_ids_by_interest[interest]:
            if other_id != own_id:
                shared[other_id] += 1
    return shared


for user in users:
    print(most_common_interests_with(user))

Counter({9: 3, 1: 2, 8: 1, 5: 1})
Counter({0: 2})
Counter({3: 1, 5: 1, 7: 1})
Counter({5: 2, 6: 2, 2: 1, 4: 1})
Counter({7: 1, 3: 1})
Counter({3: 2, 2: 1, 0: 1, 9: 1})
Counter({3: 2})
Counter({4: 1, 2: 1, 8: 1})
Counter({7: 1, 0: 1, 9: 1})
Counter({0: 3, 5: 1, 8: 1})


In [13]:
# (salary, tenure) pairs, where tenure is experience in years.
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]


In [14]:
# Average salary for each tenure

# Group salaries by exact tenure: tenure in years -> list of salaries.
salary_by_tenure = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

# Collapse each group to its mean: tenure in years -> average salary.
average_salary_by_tenure = {
    years: sum(group) / len(group)
    for years, group in salary_by_tenure.items()
}

# This is not very useful: no two users have the same tenure, so we are just reporting each individual user's salary.


In [15]:
# Averaging the salaries is more helpful if we bucket the tenures

def tenure_bucket(tenure):
    """Map a tenure in years to a coarse bucket label.

    Below 2 gives "less than two"; from 2 up to but not including 5 gives
    "between two and five"; anything else gives "more than five".
    """
    if tenure < 2:
        return "less than two"
    if tenure < 5:
        return "between two and five"
    return "more than five"

In [16]:
# Group salaries by bucket label: tenure bucket -> list of salaries in it.
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

# Keys are tenure buckets, values are the average salary for that bucket.
average_salary_by_bucket = {
    label: sum(group) / len(group)
    for label, group in salary_by_tenure_bucket.items()
}

print(average_salary_by_bucket)

{'more than five': 79166.66666666667, 'less than two': 48000.0, 'between two and five': 61500.0}


In [17]:
# Tally every lowercased, whitespace-separated word across all interests.

words_and_counts = Counter(
    word
    for _, interest in interests
    for word in interest.lower().split()
)

# Show only the words that occur more than once, most frequent first.
for word, count in words_and_counts.most_common():
    if count <= 1:
        continue
    print(word, count)

big 3
data 3
java 3
python 3
learning 3
hadoop 2
hbase 2
cassandra 2
scikit-learn 2
r 2
statistics 2
regression 2
probability 2
machine 2
neural 2
networks 2
