In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('fortune.csv')
df.head(4)

Unnamed: 0.1,Unnamed: 0,Rank,Company,Industry,Address,City,State / Country,Postal Code,Country,"Revenue (in millions, USD)",Employees,Ticker,Website,CEO
0,0,1,Walmart,General Merchandisers,702 S.W. Eighth St.,Bentonville,Arkansas,72716,U.S.,"$572,754",2300000,WMT,www.walmart.com,C. Douglas Mcmillon
1,1,2,Amazon.com,Internet Services and Retailing,410 Terry Ave. N.,Seattle,Washington,98109,U.S.,"$469,822",1608000,AMZN,www.amazon.com,Andrew R. Jassy
2,2,3,State Grid,Utilities,86 Xichang'an Ave.,Beijing,China,100031,China,"$460,617",871145,,www.sgcc.com.cn,
3,3,4,China National Petroleum,Petroleum Refining,9 Dongzhimen N. St.,Beijing,China,100007,China,"$411,693",1090345,,www.cnpc.com.cn,Li Fanrong


# Droping the 'Unnamed: 0' column

In [None]:
df = df.drop('Unnamed: 0',axis=1)

In [None]:
df.head(2)

Unnamed: 0,Rank,Company,Industry,Address,City,State / Country,Postal Code,Country,"Revenue (in millions, USD)",Employees,Ticker,Website,CEO
0,1,Walmart,General Merchandisers,702 S.W. Eighth St.,Bentonville,Arkansas,72716,U.S.,"$572,754",2300000,WMT,www.walmart.com,C. Douglas Mcmillon
1,2,Amazon.com,Internet Services and Retailing,410 Terry Ave. N.,Seattle,Washington,98109,U.S.,"$469,822",1608000,AMZN,www.amazon.com,Andrew R. Jassy


# Checking for any missing values in columns

In [None]:
#Option 1
df.isnull().sum()

Unnamed: 0,0
Rank,0
Company,0
Industry,0
Address,0
City,0
State / Country,0
Postal Code,0
Country,0
"Revenue (in millions, USD)",0
Employees,0


In [None]:
#Option 2
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Rank                        500 non-null    int64 
 1   Company                     500 non-null    object
 2   Industry                    500 non-null    object
 3   Address                     500 non-null    object
 4   City                        500 non-null    object
 5   State / Country             500 non-null    object
 6   Postal Code                 500 non-null    object
 7   Country                     500 non-null    object
 8   Revenue (in millions, USD)  500 non-null    object
 9   Employees                   500 non-null    int64 
 10  Ticker                      313 non-null    object
 11  Website                     500 non-null    object
 12  CEO                         420 non-null    object
dtypes: int64(2), object(11)
memory usage: 50.9+ KB


In [None]:
# 187 null values in 'Ticker'
# 80 null values in 'CEO'

# Finding out the number of unique industries

In [None]:
#Option 1
df['Industry'].unique().size

57

In [None]:
#Option 2
INDUSTRIES = df.groupby('Industry')
INDUSTRIES.size().shape

(57,)

# How many industries in each country

In [None]:
def Details(country):
  mask = df['Country'] == country
  temp_df = df[mask]
  xp = temp_df['Industry'].size
  return f"Number of Industries in {country} : {xp}"

In [None]:
print(Details('China'))
print(Details('U.S.'))
print(Details('India'))
print(Details('Japan'))

Number of Industries in China : 136
Number of Industries in U.S. : 124
Number of Industries in India : 9
Number of Industries in Japan : 47


# Filtering all the companies in U.S.
#AND
# filtering the dataset based on condition that Revenue of the company has to be greater than $450,000

In [None]:
mask1 = df['Country'] == 'U.S.'
df[mask1].head()

Unnamed: 0,Rank,Company,Industry,Address,City,State / Country,Postal Code,Country,"Revenue (in millions, USD)",Employees,Ticker,Website,CEO
0,1,Walmart,General Merchandisers,702 S.W. Eighth St.,Bentonville,Arkansas,72716,U.S.,"$572,754",2300000,WMT,www.walmart.com,C. Douglas Mcmillon
1,2,Amazon.com,Internet Services and Retailing,410 Terry Ave. N.,Seattle,Washington,98109,U.S.,"$469,822",1608000,AMZN,www.amazon.com,Andrew R. Jassy
6,7,Apple,"Computers, Office Equipment",1 Apple Park Way,Cupertino,California,95014,U.S.,"$365,817",154000,AAPL,www.apple.com,Timothy D. Cook
9,10,CVS Health,Health Care: Pharmacy and Other Services,1 CVS Dr.,Woonsocket,Rhode Island,2895,U.S.,"$292,111",258000,CVS,www.cvshealth.com,Karen S. Lynch
10,11,UnitedHealth Group,Health Care: Insurance and Managed Care,9900 Bren Rd. E.,Minnetonka,Minnesota,55343,U.S.,"$287,597",350000,UNH,www.unitedhealthgroup.com,Andrew P. Witty


In [None]:
mask2 = df['Revenue (in millions, USD)'] > '$450,000'
df[mask2].head()

Unnamed: 0,Rank,Company,Industry,Address,City,State / Country,Postal Code,Country,"Revenue (in millions, USD)",Employees,Ticker,Website,CEO
0,1,Walmart,General Merchandisers,702 S.W. Eighth St.,Bentonville,Arkansas,72716,U.S.,"$572,754",2300000,WMT,www.walmart.com,C. Douglas Mcmillon
1,2,Amazon.com,Internet Services and Retailing,410 Terry Ave. N.,Seattle,Washington,98109,U.S.,"$469,822",1608000,AMZN,www.amazon.com,Andrew R. Jassy
2,3,State Grid,Utilities,86 Xichang'an Ave.,Beijing,China,100031,China,"$460,617",871145,,www.sgcc.com.cn,
94,95,Electricité de France,Utilities,22-30 Ave.de Wagram,Paris,France,75008,France,"$99,861",163423,ECIFY,www.edf.fr,Jean-Bernard Levy
95,96,Huawei Investment & Holding,Network & Other Communications Equipment,Huawei Industrial Base,Shenzhen,China,518129,China,"$98,725",195000,,www.huawei.com,Ma Qianli


# Filtering the dataset on condition : Companies having employees greater than 20,00,000 and country being U.S.

In [None]:
mask3 = df['Employees'] > 100000
mask4 = df['Country'] == 'U.S.'

In [None]:
df[mask3 & mask4].head()

Unnamed: 0,Rank,Company,Industry,Address,City,State / Country,Postal Code,Country,"Revenue (in millions, USD)",Employees,Ticker,Website,CEO
0,1,Walmart,General Merchandisers,702 S.W. Eighth St.,Bentonville,Arkansas,72716,U.S.,"$572,754",2300000,WMT,www.walmart.com,C. Douglas Mcmillon
1,2,Amazon.com,Internet Services and Retailing,410 Terry Ave. N.,Seattle,Washington,98109,U.S.,"$469,822",1608000,AMZN,www.amazon.com,Andrew R. Jassy
6,7,Apple,"Computers, Office Equipment",1 Apple Park Way,Cupertino,California,95014,U.S.,"$365,817",154000,AAPL,www.apple.com,Timothy D. Cook
9,10,CVS Health,Health Care: Pharmacy and Other Services,1 CVS Dr.,Woonsocket,Rhode Island,2895,U.S.,"$292,111",258000,CVS,www.cvshealth.com,Karen S. Lynch
10,11,UnitedHealth Group,Health Care: Insurance and Managed Care,9900 Bren Rd. E.,Minnetonka,Minnesota,55343,U.S.,"$287,597",350000,UNH,www.unitedhealthgroup.com,Andrew P. Witty


# Sorting the dataset based on country and revenue


In [None]:
df.sort_values('Country')

Unnamed: 0,Rank,Company,Industry,Address,City,State / Country,Postal Code,Country,"Revenue (in millions, USD)",Employees,Ticker,Website,CEO
271,272,Woolworths Group,Food & Drug Stores,1 Woolworths Way,Bella Vista,Australia,02153,Australia,"$50,211",210067,WOLWF,www.woolworthsgroup.com.au,Brad Banducci
492,493,Coles Group,Food & Drug Stores,800 Toorak Rd.,Hawthorn East,Australia,03123,Australia,"$29,056",120000,CLEGF,www.colesgroup.com.au,Steven Cain
212,213,BHP Group,"Mining, Crude-Oil Production",171 Collins St.,Melbourne,Australia,03000,Australia,"$60,817",34478,BHP,www.bhp.com,Mike Henry
329,330,OMV Group,Petroleum Refining,Trabrennstr. 6-8,Vienna,Austria,01020,Austria,"$42,038",22434,OMVKY,www.omv.com,Alfred Stern
499,500,Umicore,Chemicals,Rue du Marais 31 Broekstraat,Brussels,Belgium,01000,Belgium,"$28,650",11050,UMICY,www.umicore.com,Mathias Miedreich
...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,145,Intel,Semiconductors & Other Electronic Components,2200 Mission College Blvd.,Santa Clara,California,95054,U.S.,"$79,024",121100,INTC,www.intel.com,Patrick P. Gelsinger
142,143,PepsiCo,Food Consumer Products,700 Anderson Hill Rd.,Purchase,New York,10577,U.S.,"$79,474",309000,PEP,www.pepsico.com,Ramon L. Laguarta
140,141,Citigroup,Banks: Commercial and Savings,388 Greenwich St.,New York,New York,10013,U.S.,"$79,865",221768,C,www.citigroup.com,Jane Fraser
167,168,International Business Machines,Information Technology Services,1 New Orchard Rd.,Armonk,New York,10504,U.S.,"$72,344",297800,IBM,www.ibm.com,Arvind Krishna


In [None]:
# Step 1 : convering the Revenue column into numeric column by removing the '$' symbol from the values of it
df['Revenue (in millions, USD)'] =  df['Revenue (in millions, USD)'].str.replace(r'[\$,]', '', regex=True).astype(int)

In [None]:
df

Unnamed: 0,Rank,Company,Industry,Address,City,State / Country,Postal Code,Country,"Revenue (in millions, USD)",Employees,Ticker,Website,CEO
0,1,Walmart,General Merchandisers,702 S.W. Eighth St.,Bentonville,Arkansas,72716,U.S.,572754,2300000,WMT,www.walmart.com,C. Douglas Mcmillon
1,2,Amazon.com,Internet Services and Retailing,410 Terry Ave. N.,Seattle,Washington,98109,U.S.,469822,1608000,AMZN,www.amazon.com,Andrew R. Jassy
2,3,State Grid,Utilities,86 Xichang'an Ave.,Beijing,China,100031,China,460617,871145,,www.sgcc.com.cn,
3,4,China National Petroleum,Petroleum Refining,9 Dongzhimen N. St.,Beijing,China,100007,China,411693,1090345,,www.cnpc.com.cn,Li Fanrong
4,5,Sinopec Group,Petroleum Refining,22 Chaoyangmen N. St.,Beijing,China,100728,China,401314,542286,,www.sinopec.com,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,DSV,"Mail, Package and Freight Delivery",Hovedgaden 630,Hedehusene,Denmark,02640,Denmark,28988,77958,DSDVY,www.dsv.com,Jens Bjørn Andersen
496,497,ABB,Industrial Machinery,Affolternstrasse 44,Zurich,Switzerland,08050,Switzerland,28945,104400,ABB,www.abb.com,Bjoern Rosengren
497,498,Mondelez International,Food Consumer Products,905 W. Fulton Market,Chicago,Illinois,60607,U.S.,28720,79000,MDLZ,www.mondelezinternational.com,
498,499,Danone,Food Consumer Products,17 Blvd. Haussmann,Paris,France,75009,France,28708,98105,DANOY,www.danone.com,Emmanuel Faber


In [None]:
# step 2 :- Insert masking condition to filter dataset

mask5 = df['Revenue (in millions, USD)'] > 400000
masked_df = df[mask5]
masked_df

Unnamed: 0,Rank,Company,Industry,Address,City,State / Country,Postal Code,Country,"Revenue (in millions, USD)",Employees,Ticker,Website,CEO
0,1,Walmart,General Merchandisers,702 S.W. Eighth St.,Bentonville,Arkansas,72716,U.S.,572754,2300000,WMT,www.walmart.com,C. Douglas Mcmillon
1,2,Amazon.com,Internet Services and Retailing,410 Terry Ave. N.,Seattle,Washington,98109,U.S.,469822,1608000,AMZN,www.amazon.com,Andrew R. Jassy
2,3,State Grid,Utilities,86 Xichang'an Ave.,Beijing,China,100031,China,460617,871145,,www.sgcc.com.cn,
3,4,China National Petroleum,Petroleum Refining,9 Dongzhimen N. St.,Beijing,China,100007,China,411693,1090345,,www.cnpc.com.cn,Li Fanrong
4,5,Sinopec Group,Petroleum Refining,22 Chaoyangmen N. St.,Beijing,China,100728,China,401314,542286,,www.sinopec.com,
5,6,Saudi Aramco,"Mining, Crude-Oil Production",P.O. Box 5000,Dhahran,Saudi Arabia,31311,Saudi Arabia,400399,68493,,www.aramco.com,Amin H. Nasser


# Groupby to find out total number of employees, average revenue, number of industries

In [None]:
def Country_Details(country):
  mask6 = df['Country'] == country
  temp_df = df[mask6]
  x = temp_df['Industry'].unique().size
  y = temp_df['Employees'].sum()
  z = temp_df['Revenue (in millions, USD)'].sum() / x
  print(f"Total employees :- {y}")
  print(f"Average revenue of Industries in {country} is : {z}")
  print(f"The total number of Industries in {country} : {x}")

In [None]:
Country_Details("U.S.")

Total employees :- 18718398
Average revenue of Industries in U.S. is : 233693.54166666666
The total number of Industries in U.S. : 48


# Feature Engineering : Generating a new column

In [None]:
temp = df
def myfunc(emp):
  if emp > 1000000:
    return 'Enormous'
  elif emp > 500000 and emp < 1000000:
    return 'Medium'
  else :
    return 'Small'


temp['Size cateogory'] = temp['Employees'].apply(myfunc)
temp.head(3)

Unnamed: 0,Rank,Company,Industry,Address,City,State / Country,Postal Code,Country,"Revenue (in millions, USD)",Employees,Ticker,Website,CEO,CEO_lastname,Size Cateogory,Size cateogory
0,1,Walmart,General Merchandisers,702 S.W. Eighth St.,Bentonville,Arkansas,72716,U.S.,572754,2300000,WMT,www.walmart.com,C. Douglas Mcmillon,C. Douglas Mcmillon,Enormous,Enormous
1,2,Amazon.com,Internet Services and Retailing,410 Terry Ave. N.,Seattle,Washington,98109,U.S.,469822,1608000,AMZN,www.amazon.com,Andrew R. Jassy,Andrew R. Jassy,Enormous,Enormous
2,3,State Grid,Utilities,86 Xichang'an Ave.,Beijing,China,100031,China,460617,871145,,www.sgcc.com.cn,,,Medium,Medium
